@inproceedings{gajjar_kashyap_aysu_franzon_dey_cheng_2022,
  title={FAXID: FPGA-Accelerated XGBoost Inference for Data Centers using HLS},
  ISSN={2576-2621},
  url={http://dx.doi.org/10.1109/fccm53951.2022.9786085},
  DOI={10.1109/FCCM53951.2022.9786085},
  abstractNote={Advanced ensemble trees have proven quite effective in providing real-time predictions for applications such as ransomware detection, medical diagnosis, recommendation engines, fraud detection, failure prediction, and crime-risk assessment. In particular, XGBoost, one of the most prominent and widely used decision-tree ensembles, has gained popularity due to various optimizations of the gradient boosting framework that provide increased accuracy for classification and regression problems. XGBoost's relatively fast training, handling of missing values, flexibility, and parallel processing make it a strong candidate for data center workloads. Today's data centers, with their enormous Input/Output Operations per Second (IOPS), demand real-time accelerated inference with low latency and high throughput for data-intensive applications such as ransomware detection and fraud detection. This paper showcases an FPGA-based XGBoost accelerator designed with High-Level Synthesis (HLS) tools and design flow, accelerating binary classification inference. We employ the Alveo U50 and U200 to demonstrate the performance of the proposed design and compare it with existing state-of-the-art CPU (Intel Xeon E5-2686 v4) and GPU (Nvidia Tensor Core T4) implementations on relevant datasets. We show the latency speedup of our proposed design over state-of-the-art CPU and GPU implementations, along with its energy efficiency and cost-effectiveness. The proposed accelerator is up to 65.8× and 5.3× faster, in terms of latency, than the CPU and GPU, respectively. The Alveo U50 is the more cost-effective device, while the Alveo U200 stands out as more energy-efficient.},
  booktitle={2022 IEEE 30th International Symposium on Field-Programmable Custom Computing Machines (FCCM 2022)},
  publisher={IEEE},
  author={Gajjar, Archit and Kashyap, Priyank and Aysu, Aydin and Franzon, Paul and Dey, Sumon and Cheng, Chris},
  year={2022},
  pages={113–121}
}

@article{li_franzon_dey_schabel_2022,
  title={Hardware Implementation of Hierarchical Temporal Memory Algorithm},
  volume={18},
  ISSN={1550-4840},
  DOI={10.1145/3479430},
  abstractNote={Hierarchical temporal memory (HTM) is an unsupervised machine learning algorithm that can learn both spatial and temporal information from its input. It has been successfully applied in multiple areas. In this paper, we propose a multi-level hierarchical ASIC implementation of HTM, referred to as a processor core, to support both spatial and temporal pooling. To improve the unbalanced workload in HTM, the proposed design provides different mapping methods for spatial and temporal pooling, respectively. In the proposed design, we implement a distributed memory system by assigning one dedicated memory bank to each level of the hierarchy to improve memory bandwidth utilization efficiency. Finally, the hot-spot operations are optimized using a series of customized units. For scalability, we propose a ring-based network consisting of multiple processor cores to support a larger HTM network. To evaluate the performance of our proposed design, we map an HTM network that includes 2,048 columns and 65,536 cells on both the proposed design and an NVIDIA Tesla K40c GPU, using the KTH database as input.
The latency and power of the proposed design are 6.04 ms and 4.1 W using a GP 65 nm technology. Compared to the equivalent GPU implementation, latency and power are improved by 12.45× and 57.32×, respectively.},
  number={1},
  journal={ACM Journal on Emerging Technologies in Computing Systems},
  author={Li, Weifu and Franzon, Paul and Dey, Sumon and Schabel, Joshua},
  year={2022},
  month={Jan}
}

@inproceedings{kashyap_choi_dey_baron_wong_wu_cheng_franzon_2022,
  title={Modeling of Adaptive Receiver Performance Using Generative Adversarial Networks},
  ISSN={2377-5726},
  url={http://dx.doi.org/10.1109/ectc51906.2022.00307},
  DOI={10.1109/ECTC51906.2022.00307},
  abstractNote={As the development of IBIS Algorithmic Modeling Interface (IBIS-AMI) models grows complex and requires time-consuming simulations, a data-driven and domain-independent approach can have tremendous value. This paper presents a data-driven approach to modeling a high-speed serializer/deserializer (SerDes) receiver through generative adversarial networks (GANs). In this work, the modeling considers multiple channels, random bitstreams, and varying decision feedback equalizer (DFE) tap values to predict an accurate bit error rate (BER) contour plot. We employ a discriminator structure that improves the training, generating a contour plot that is difficult to distinguish from the ground truth. The generated plots' bathtub curves correlate strongly with the ground-truth bathtub curves and have a root-mean-squared error (RMSE) of 0.014, indicating a good fit.},
  booktitle={IEEE 72nd Electronic Components and Technology Conference (ECTC 2022)},
  publisher={IEEE},
  author={Kashyap, Priyank and Choi, Yongjin and Dey, Sumon and Baron, Dror and Wong, Chau-Wai and Wu, Tianfu and Cheng, Chris and Franzon, Paul D.},
  year={2022},
  pages={1958–1963}
}

@article{dey_baker_schabel_li_franzon_2021,
  title={A Scalable Cluster-based Hierarchical Hardware Accelerator for a Cortically Inspired Algorithm},
  volume={17},
  ISSN={1550-4840},
  DOI={10.1145/3447777},
  abstractNote={This article describes a scalable, configurable, and cluster-based hierarchical hardware accelerator, built as a custom hardware architecture, for Sparsey, a cortical learning algorithm. Sparsey is inspired by the operation of the human cortex and uses a Sparse Distributed Representation to enable unsupervised learning and inference in the same algorithm. A distributed on-chip memory organization is designed and implemented in custom hardware to improve memory bandwidth and accelerate the memory read/write operations for synaptic weight matrices. Bit-level data are processed from distributed on-chip memory, and custom multiply-accumulate hardware is implemented for binary and fixed-point multiply-accumulate operations. Fixed-point arithmetic and fixed-point storage are also adopted in this implementation. At 16 nm, the custom hardware of Sparsey achieved an overall 24.39× speedup, 353.12× energy efficiency per frame, and a 1.43× reduction in silicon area against a state-of-the-art GPU.},
  number={4},
  journal={ACM Journal on Emerging Technologies in Computing Systems},
  author={Dey, Sumon and Baker, Lee and Schabel, Joshua and Li, Weifu and Franzon, Paul D.},
  year={2021},
  month={Oct}
}

@inproceedings{franzon_davis_rotenberg_stevens_lipa_nigussie_pan_baker_schabel_dey_et_al_2021,
  title={Design for 3D Stacked Circuits},
  ISSN={2380-9248},
  DOI={10.1109/IEDM19574.2021.9720553},
  abstractNote={2.5D and 3D technologies can give rise to a node equivalent of scaling due to improved connectivity.
Aggressive exploitation scenarios include functional partitioning, circuit partitioning, logic on DRAM, design obfuscation, and modular chiplets. Design issues that need to be addressed in pursuing such exploitations include thermal management, design for test, and computer-aided design.},
  booktitle={2021 IEEE International Electron Devices Meeting (IEDM)},
  author={Franzon, P. and Davis, W. and Rotenberg, E. and Stevens, J. and Lipa, S. and Nigussie, T. and Pan, H. and Baker, L. and Schabel, J. and Dey, S. and others},
  year={2021}
}

@inproceedings{dey_franzon_2016,
  title={Design and ASIC acceleration of cortical algorithm for text recognition},
  booktitle={2016 29th IEEE International System-on-Chip Conference (SOCC)},
  author={Dey, S. and Franzon, P. D.},
  year={2016},
  pages={114–119}
}

@inproceedings{schabel_baker_dey_li_franzon_2016,
  title={Processor-in-memory support for artificial neural networks},
  booktitle={2016 IEEE International Conference on Rebooting Computing (ICRC)},
  author={Schabel, J. and Baker, L. and Dey, S. and Li, W. F. and Franzon, P. D.},
  year={2016}
}