@article{li_franzon_dey_schabel_2022,
  title={Hardware Implementation of Hierarchical Temporal Memory Algorithm},
  volume={18},
  ISSN={1550-4840},
  DOI={10.1145/3479430},
  abstractNote={Hierarchical temporal memory (HTM) is an unsupervised machine learning algorithm that can learn both the spatial and temporal structure of its input, and it has been applied successfully in multiple areas. In this paper, we propose a multi-level hierarchical ASIC implementation of HTM, referred to as a processor core, that supports both spatial and temporal pooling. To address the unbalanced workload in HTM, the proposed design provides separate mapping methods for spatial and temporal pooling. In the proposed design, we implement a distributed memory system that assigns one dedicated memory bank to each level of the hierarchy to improve memory bandwidth utilization, and the hot-spot operations are optimized using a series of customized units. For scalability, we propose a ring-based network of multiple processor cores to support a larger HTM network. To evaluate the performance of the proposed design, we map an HTM network with 2,048 columns and 65,536 cells onto both the proposed design and an NVIDIA Tesla K40c GPU, using the KTH database as input. The latency and power of the proposed design are 6.04 ms and 4.1 W, respectively, using 65 nm GP technology. Compared to the equivalent GPU implementation, latency and power are improved by 12.45× and 57.32×, respectively.},
  number={1},
  journal={ACM Journal on Emerging Technologies in Computing Systems},
  author={Li, Weifu and Franzon, Paul and Dey, Sumon and Schabel, Joshua},
  year={2022},
  month={Jan}
}

@article{dey_baker_schabel_li_franzon_2021,
  title={A Scalable Cluster-based Hierarchical Hardware Accelerator for a Cortically Inspired Algorithm},
  volume={17},
  ISSN={1550-4840},
  DOI={10.1145/3447777},
  abstractNote={This article describes a scalable, configurable, cluster-based hierarchical hardware accelerator, realized as a custom hardware architecture for Sparsey, a cortical learning algorithm. Sparsey is inspired by the operation of the human cortex and uses a Sparse Distributed Representation to enable unsupervised learning and inference within the same algorithm. A distributed on-chip memory organization is designed and implemented in custom hardware to improve memory bandwidth and accelerate read/write operations on the synaptic weight matrices. Bit-level data are processed from the distributed on-chip memory, and custom multiply-accumulate hardware is implemented for binary and fixed-point multiply-accumulate operations. Fixed-point arithmetic and fixed-point storage are also adopted in this implementation. At 16 nm, the custom Sparsey hardware achieves an overall 24.39× speedup, 353.12× energy efficiency per frame, and a 1.43× reduction in silicon area against a state-of-the-art GPU.},
  number={4},
  journal={ACM Journal on Emerging Technologies in Computing Systems},
  author={Dey, Sumon and Baker, Lee and Schabel, Joshua and Li, Weifu and Franzon, Paul D.},
  year={2021},
  month={Oct}
}

@inproceedings{li_franzon,
  title={Hardware Implementation of Hierarchical Temporal Memory Algorithm},
  booktitle={2016 29th IEEE International System-on-Chip Conference (SOCC)},
  author={Li, W. F. and Franzon, P.},
  year={2016},
  pages={133--138}
}

@inproceedings{schabel_baker_dey_li_franzon,
  title={Processor-in-memory Support for Artificial Neural Networks},
  booktitle={2016 IEEE International Conference on Rebooting Computing (ICRC)},
  author={Schabel, J. and Baker, L. and Dey, S. and Li, W. F. and Franzon, P. D.},
  year={2016}
}