@article{cicek_ning_ozturk_shen_2022,
  title        = {General Reuse-Centric {CNN} Accelerator},
  author       = {Cicek, Nihat Mert and Ning, Lin and Ozturk, Ozcan and Shen, Xipeng},
  journal      = {IEEE Transactions on Computers},
  publisher    = {Institute of Electrical and Electronics Engineers (IEEE)},
  volume       = {71},
  number       = {4},
  pages        = {880--891},
  year         = {2022},
  month        = apr,
  issn         = {1557-9956},
  doi          = {10.1109/TC.2021.3064608},
  abstractnote = {This article introduces the first general reuse-centric accelerator for CNN inferences. Unlike prior work that exploits similarities only across consecutive video frames, general reuse-centric accelerator is able to discover similarities among arbitrary patches within an image or across independent images, and translate them into computation time and energy savings. Experiments show that the accelerator complements both prior software-based CNN and various CNN hardware accelerators, producing up to 14.96X speedups for similarity discovery, up to 2.70X speedups for overall inference.},
}
@inproceedings{guan_chaudhary_xu_ning_zhang_shen_2021,
  title        = {Recurrent Neural Networks Meet {Context-Free Grammar}: Two Birds with One Stone},
  author       = {Guan, Hui and Chaudhary, Umang and Xu, Yuanchao and Ning, Lin and Zhang, Lijun and Shen, Xipeng},
  booktitle    = {2021 21st IEEE International Conference on Data Mining (ICDM 2021)},
  pages        = {1078--1083},
  year         = {2021},
  issn         = {1550-4786},
  doi          = {10.1109/ICDM51629.2021.00125},
  abstractnote = {Recurrent Neural Networks (RNN) are widely used for various prediction tasks on sequences such as text, speed signals, program traces, and system logs. Due to RNNs’ inherently sequential behavior, one key challenge for the effective adoption of RNNs is to reduce the time spent on RNN inference and to increase the scope of a prediction. This work introduces CFG-guided compressed learning, an approach that creatively integrates Context-Free Grammar (CFG) and online tokenization into RNN learning and inference for streaming inputs. Through a hierarchical compression algorithm, it compresses an input sequence to a CFG and makes predictions based on the compressed sequence. Its algorithm design employs a set of techniques to overcome the issues from the myopic nature of online tokenization, the tension between inference accuracy and compression rate, and other complexities. Experiments on 16 real-world sequences of various types validate that the proposed compressed learning can successfully recognize and leverage repetitive patterns in input sequences, and effectively translate them into dramatic (1-1762×) inference speedups as well as much (1-7830×) expanded prediction scope, while keeping the inference accuracy satisfactory.},
}
@inproceedings{ning_guan_shen_2019,
  title        = {Adaptive Deep Reuse: Accelerating {CNN} Training on the Fly},
  author       = {Ning, Lin and Guan, Hui and Shen, Xipeng},
  booktitle    = {2019 IEEE 35th International Conference on Data Engineering (ICDE 2019)},
  pages        = {1538--1549},
  year         = {2019},
  issn         = {1084-4627},
  doi          = {10.1109/ICDE.2019.00138},
  abstractnote = {This work proposes adaptive deep reuse, a method for accelerating CNN training by identifying and avoiding the unnecessary computations contained in each specific training on the fly. It makes two-fold major contributions. (1) It empirically proves the existence of a lot of similarities among neuron vectors in both forward and backward propagation of CNN. (2) It introduces the first adaptive strategy for translating the similarities into computation reuse in CNN training. The strategy adaptively adjusts the strength of reuse based on the different tolerance of precision relaxation in different CNN training stages. Experiments show that adaptive deep reuse saves 69% CNN training time with no accuracy loss.},
}
@article{ning_pittman_shen_2018,
  title        = {{LCD}: A Fast Contrastive Divergence Based Algorithm for {Restricted Boltzmann Machine}},
  author       = {Ning, Lin and Pittman, Randall and Shen, Xipeng},
  journal      = {Neural Networks},
  volume       = {108},
  pages        = {399--410},
  year         = {2018},
  month        = dec,
  issn         = {1879-2782},
  doi          = {10.1016/j.neunet.2018.08.018},
  abstractnote = {Restricted Boltzmann Machine (RBM) is the building block of Deep Belief Nets and other deep learning tools. Fast learning and prediction are both essential for practical usage of RBM-based machine learning techniques. This paper proposes Lean Contrastive Divergence (LCD), a modified Contrastive Divergence (CD) algorithm, to accelerate RBM learning and prediction without changing the results. LCD avoids most of the required computations with two optimization techniques. The first is called bounds-based filtering, which, through triangle inequality, replaces expensive calculations of many vector dot products with fast bounds calculations. The second is delta product, which effectively detects and avoids many repeated calculations in the core operation of RBM, Gibbs Sampling. The optimizations are applicable to both the standard contrastive divergence learning algorithm and its variations. In addition, this paper presents how to implement these optimizations effectively on massively parallel processors. Results show that the optimizations can produce several-fold (up to 3X for training and 5.3X for prediction) speedups.},
}
@inproceedings{ding_ning_guan_shen_2017,
  title        = {Generalizations of the Theory and Deployment of Triangular Inequality for Compiler-Based Strength Reduction},
  author       = {Ding, Y. F. and Ning, Lin and Guan, Hui and Shen, Xipeng},
  booktitle    = {ACM SIGPLAN Notices},
  volume       = {52},
  number       = {6},
  pages        = {33--48},
  year         = {2017},
  doi          = {10.1145/3140587.3062377},
  abstractnote = {Triangular Inequality (TI) has been used in many manual algorithm designs to achieve good efficiency in solving some distance calculation-based problems. This paper presents our generalization of the idea into a compiler optimization technique, named TI-based strength reduction. The generalization consists of three parts. The first is the establishment of the theoretic foundation of this new optimization via the development of a new form of TI named Angular Triangular Inequality, along with several fundamental theorems. The second is the revealing of the properties of the new forms of TI and the proposal of guided TI adaptation, a systematic method to address the difficulties in effective deployments of TI optimizations. The third is an integration of the new optimization technique in an open-source compiler. Experiments on a set of data mining and machine learning algorithms show that the new technique can speed up the standard implementations by as much as 134X and 46X on average for distance-related problems, outperforming previous TI-based optimizations by 2.35X on average. It also extends the applicability of TI-based optimizations to vector related problems, producing tens of times of speedup.},
  internal-note = {Author "Ding, Y. F." kept abbreviated; full given name not verifiable from this file — confirm against the published paper.},
}
@inproceedings{ning_pittman_shen_2017,
  title        = {{LCD}: A Fast Contrastive Divergence Based Algorithm for {Restricted Boltzmann Machine}},
  author       = {Ning, Lin and Pittman, Randall and Shen, Xipeng},
  booktitle    = {2017 17th IEEE International Conference on Data Mining (ICDM)},
  pages        = {1015--1020},
  year         = {2017},
  issn         = {1550-4786},
  doi          = {10.1109/icdm.2017.131},
  abstractnote = {Restricted Boltzmann Machine (RBM) is the building block of Deep Belief Nets and other deep learning tools. Fast learning and prediction are both essential for practical usage of RBM-based machine learning techniques. This paper proposes Lean Contrastive Divergence (LCD), a modified Contrastive Divergence (CD) algorithm, to accelerate RBM learning and prediction without changing the results. LCD avoids most of the required computations with two optimization techniques. The first is called bounds-based filtering, which, through triangle inequality, replaces expensive calculations of many vector dot products with fast bounds calculations. The second is delta product, which effectively detects and avoids many repeated calculations in the core operation of RBM, Gibbs Sampling. The optimizations are applicable to both the standard contrastive divergence learning algorithm and its variations. Results show that the optimizations can produce several-fold (up to 3X for training and 5.3X for prediction) speedups.},
}