@article{chen_sung_shen_tallent_barker_li_2023,
  author   = {Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang},
  title    = {Accelerating Matrix-Centric Graph Processing on {GPUs} through Bit-Level Optimizations},
  journal  = {Journal of Parallel and Distributed Computing},
  volume   = {177},
  pages    = {53--67},
  year     = {2023},
  month    = jul,
  issn     = {1096-0848},
  doi      = {10.1016/j.jpdc.2023.02.013},
  abstract = {Even though it is well known that binary values are common in graph applications (e.g., adjacency matrix), how to leverage the phenomenon for efficiency has not yet been adequately explored. This paper presents a systematic study on how to unlock the potential of the bit-level optimizations of graph computations that involve binary values. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by the intrinsics of modern GPUs. It additionally introduces Deep Reinforcement Learning (DRL) as an efficient way to best configure the bit-level optimizations on the fly. The DQN-based adaptive tile size selector with dedicated model training can reach 68\% prediction accuracy. Evaluations on the NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40$\times$ and 6555$\times$ for essential GraphBLAS kernels SpMV and SpGEMM, respectively, accelerating GraphBLAS-based BFS by up to 433$\times$, SSSP, PR, and CC 35$\times$, and TC 52$\times$.},
}

@inproceedings{chen_sung_shen_choudhury_li_2023,
  author    = {Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Choudhury, Sutanay and Li, Ang},
  title     = {{BitGNN}: Unleashing the Performance Potential of Binary Graph Neural Networks on {GPUs}},
  booktitle = {Proceedings of the 37th International Conference on Supercomputing ({ACM} {ICS} 2023)},
  year      = {2023},
  pages     = {264--276},
  doi       = {10.1145/3577193.3593725},
  abstract  = {Recent studies have shown that Binary Graph Neural Networks (GNNs) are promising for saving computations of GNNs through binarized tensors. Prior work, however, mainly focused on algorithm designs or training techniques, leaving it open to how to materialize the performance potential on accelerator hardware fully. This work redesigns the binary GNN inference backend from the efficiency perspective. It fills the gap by proposing a series of abstractions and techniques to map binary GNNs and their computations best to fit the nature of bit manipulations on GPUs. Results on real-world graphs with GCNs, GraphSAGE, and GraphSAINT show that the proposed techniques outperform state-of-the-art binary GNN implementations by 8--22$\times$ with the same accuracy maintained. BitGNN code is publicly available.},
}

@inproceedings{chen_sung_shen_tallent_barker_li_2022,
  author    = {Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang},
  title     = {{Bit-GraphBLAS}: Bit-Level Optimizations of Matrix-Centric Graph Processing on {GPU}},
  booktitle = {2022 {IEEE} 36th International Parallel and Distributed Processing Symposium ({IPDPS} 2022)},
  year      = {2022},
  pages     = {515--525},
  issn      = {1530-2075},
  doi       = {10.1109/IPDPS53621.2022.00056},
  abstract  = {In a general graph data structure like an adjacency matrix, when edges are homogeneous, the connectivity of two nodes can be sufficiently represented using a single bit. This insight has, however, not yet been adequately exploited by the existing matrix-centric graph processing frameworks. This work fills the void by systematically exploring the bit-level representation of graphs and the corresponding optimizations to the graph operations. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by leveraging the intrinsics of modern GPUs. Evaluations on NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40$\times$ and 6555$\times$ for essential GraphBLAS kernels SpMV and SpGEMM, respectively, making GraphBLAS-based BFS accelerate up to 433$\times$, SSSP, PR, and CC up to 35$\times$, and TC up to 52$\times$.},
}

@inproceedings{sung_xu_guan_niu_ren_wang_liu_shen_2022,
  author    = {Sung, Hsin-Hsuan and Xu, Yuanchao and Guan, Jiexiong and Niu, Wei and Ren, Bin and Wang, Yanzhi and Liu, Shaoshan and Shen, Xipeng},
  title     = {Brief Industry Paper: Enabling Level-4 Autonomous Driving on a Single {\$}1k Off-the-Shelf Card},
  booktitle = {2022 {IEEE} 28th Real-Time and Embedded Technology and Applications Symposium ({RTAS})},
  year      = {2022},
  pages     = {297--300},
  issn      = {1545-3421},
  doi       = {10.1109/RTAS54340.2022.00032},
  abstract  = {In the past few years we have developed hardware computing systems for commercial autonomous vehicles, but inevitably the high development cost and long turn-around time have been major roadblocks for commercial deployment. Hence we also explored the potential of software optimization. This paper, for the first time, shows that it is feasible to enable full level-4 autonomous driving workloads on a single off-the-shelf card (Jetson AGX Xavier) for less than \$1k, an order of magnitude less than the state-of-the-art systems, while meeting all the requirements of latency. The success comes from the resolution of some important issues shared by existing practices through a series of measures and innovations.},
}

@inproceedings{zhao_niu_yuan_cai_sung_liu_liu_shen_ren_wang_et_al_2021,
  author    = {Zhao, Pu and Niu, Wei and Yuan, Geng and Cai, Yuxuan and Sung, Hsin-Hsuan and Liu, Shaoshan and Liu, Sijia and Shen, Xipeng and Ren, Bin and Wang, Yanzhi and others},
  title     = {Brief Industry Paper: Towards Real-Time {3D} Object Detection for Autonomous Vehicles with Pruning Search},
  booktitle = {2021 {IEEE} 27th Real-Time and Embedded Technology and Applications Symposium ({RTAS} 2021)},
  year      = {2021},
  pages     = {425--428},
  issn      = {1545-3421},
  doi       = {10.1109/RTAS52030.2021.00043},
  abstract  = {In autonomous driving, 3D object detection is essential as it provides basic knowledge about the environment. However, as deep learning based 3D detection methods are usually computation intensive, it is challenging to support real-time 3D object detection on edge-computing devices in self-driving cars with limited computation and memory resources. To facilitate this, we propose a compiler-aware pruning search framework, to achieve real-time inference of 3D object detection on the resource-limited mobile devices. Specifically, a generator is applied to sample better pruning proposals in the search space based on current proposals with their performance, and an evaluator is adopted to evaluate the sampled pruning proposal performance. To accelerate the search, the evaluator employs Bayesian optimization with an ensemble of neural predictors. We demonstrate in experiments that for the first time, the pruning search framework can achieve real-time 3D object detection on mobile (Samsung Galaxy S20 phone) with state-of-the-art detection performance.},
}