@article{chen_sung_shen_tallent_barker_li_2023, title={Accelerating matrix-centric graph processing on GPUs through bit-level optimizations}, volume={177}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2023.02.013}, abstractNote={Even though it is well known that binary values are common in graph applications (e.g., the adjacency matrix), how to leverage this phenomenon for efficiency has not yet been adequately explored. This paper presents a systematic study of how to unlock the potential of bit-level optimizations for graph computations that involve binary values. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR using the intrinsics of modern GPUs. It additionally introduces Deep Reinforcement Learning (DRL) as an efficient way to best configure the bit-level optimizations on the fly. The DQN-based adaptive tile-size selector with dedicated model training reaches 68% prediction accuracy. Evaluations on NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40× and 6555× speedups for the essential GraphBLAS kernels SpMV and SpGEMM, respectively, accelerating GraphBLAS-based BFS by up to 433×, SSSP, PR, and CC by up to 35×, and TC by up to 52×.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang}, year={2023}, month={Jul}, pages={53–67} }

@article{chen_sung_shen_choudhury_li_2023, title={BitGNN: Unleashing the Performance Potential of Binary Graph Neural Networks on GPUs}, url={https://doi.org/10.1145/3577193.3593725}, DOI={10.1145/3577193.3593725}, abstractNote={Recent studies have shown that Binary Graph Neural Networks (GNNs) are promising for saving computations of GNNs through binarized tensors. Prior work, however, mainly focused on algorithm designs or training techniques, leaving it open how to fully materialize the performance potential on accelerator hardware. This work redesigns the binary GNN inference backend from the efficiency perspective. It fills the gap by proposing a series of abstractions and techniques that best map binary GNNs and their computations to the nature of bit manipulations on GPUs. Results on real-world graphs with GCNs, GraphSAGE, and GraphSAINT show that the proposed techniques outperform state-of-the-art binary GNN implementations by 8–22× while maintaining the same accuracy. The BitGNN code is publicly available.}, journal={PROCEEDINGS OF THE 37TH INTERNATIONAL CONFERENCE ON SUPERCOMPUTING, ACM ICS 2023}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Choudhury, Sutanay and Li, Ang}, year={2023}, pages={264–276} }

@article{chen_niu_ren_wang_shen_2023, title={Survey: Exploiting Data Redundancy for Optimization of Deep Learning}, volume={55}, ISSN={["1557-7341"]}, DOI={10.1145/3564663}, abstractNote={Data redundancy is ubiquitous in the inputs and intermediate results of Deep Neural Networks (DNNs). It offers many significant opportunities for improving DNN performance and efficiency and has been explored in a large body of work. These studies are scattered across many venues and several years. The targets they focus on range from images to videos and texts, and the techniques they use to detect and exploit data redundancy also vary in many aspects.
There is not yet a systematic examination and summary of these many efforts, making it difficult for researchers to get a comprehensive view of the prior work, the state of the art, the differences and shared principles, and the areas and directions yet to explore. This article fills that void. It surveys hundreds of recent papers on the topic, introduces a novel taxonomy that puts the various techniques into a single categorization framework, offers a comprehensive description of the main methods for exploiting data redundancy to improve multiple kinds of DNNs, and points out a set of research opportunities for future exploration.}, number={10}, journal={ACM COMPUTING SURVEYS}, author={Chen, Jou-An and Niu, Wei and Ren, Bin and Wang, Yanzhi and Shen, Xipeng}, year={2023}, month={Oct} }

@article{chen_sung_shen_tallent_barker_li_2022, title={Bit-GraphBLAS: Bit-Level Optimizations of Matrix-Centric Graph Processing on GPU}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS53621.2022.00056}, abstractNote={In a general graph data structure such as an adjacency matrix, when edges are homogeneous, the connectivity of two nodes can be sufficiently represented by a single bit. This insight, however, has not yet been adequately exploited by existing matrix-centric graph processing frameworks. This work fills the void by systematically exploring the bit-level representation of graphs and the corresponding optimizations to graph operations. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by leveraging the intrinsics of modern GPUs. Evaluations on NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40× and 6555× speedups for the essential GraphBLAS kernels SpMV and SpGEMM, respectively, accelerating GraphBLAS-based BFS by up to 433×, SSSP, PR, and CC by up to 35×, and TC by up to 52×.}, journal={2022 IEEE 36TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2022)}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang}, year={2022}, pages={515–525} }