@inproceedings{shah_yu_di_lykov_alexeev_becchi_cappello_2023,
  title={GPU-Accelerated Error-Bounded Compression Framework for Quantum Circuit Simulations},
  ISSN={1530-2075},
  DOI={10.1109/IPDPS54959.2023.00081},
  abstractNote={Quantum circuit simulations enable researchers to develop quantum algorithms without the need for a physical quantum computer. Quantum computing simulators, however, all suffer from significant memory footprint requirements, which prevent large circuits from being simulated on classical supercomputers. In this paper, we explore different lossy compression strategies to substantially shrink quantum circuit tensors in the QTensor package (a state-of-the-art tensor network quantum circuit simulator) while ensuring the reconstructed data satisfy the user-required fidelity. Our contribution is fourfold. (1) We propose a series of optimized pre- and post-processing steps to boost the compression ratio of tensors with very limited performance overhead. (2) We characterize the impact of lossy decompressed data on quantum circuit simulation results, and leverage the analysis to ensure the fidelity of reconstructed data. (3) We propose a configurable GPU compression framework based on cuSZ and cuSZx, two state-of-the-art GPU-accelerated lossy compressors, to address different use cases: prioritizing either compression ratio or compression speed. (4) We perform a comprehensive evaluation by running 9 state-of-the-art compressors on an NVIDIA A100 GPU using QTensor-generated tensors of varying sizes. When prioritizing compression ratio, our results show that our strategies can increase the compression ratio nearly 10 times compared to using only cuSZ. When prioritizing throughput, we can compress at a speed comparable to cuSZx while achieving 3-4× higher compression ratios. Decompressed tensors can be used in QTensor circuit simulation to yield a final energy result within 1-5% of the true energy value.},
  booktitle={2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  author={Shah, Milan and Yu, Xiaodong and Di, Sheng and Lykov, Danylo and Alexeev, Yuri and Becchi, Michela and Cappello, Franck},
  year={2023},
  pages={757--767}
}

@inproceedings{shah_yu_di_becchi_cappello_2023,
  title={Lightweight Huffman Coding for Efficient GPU Compression},
  DOI={10.1145/3577193.3593736},
  abstractNote={Lossy compression is often deployed in scientific applications to reduce data footprint and improve data transfers and I/O performance. Especially for applications requiring on-the-fly compression, it is essential to minimize compression's runtime. In this paper, we design a scheme to improve the performance of cuSZ, a GPU-based lossy compressor. We observe that Huffman coding - used by cuSZ to compress metadata generated during compression - incurs a performance overhead that can be significant, especially for smaller datasets. Our work seeks to reduce the Huffman coding runtime with minimal-to-no impact on cuSZ's compression efficiency. Our contributions are as follows. First, we examine a variety of probability distributions to determine which distributions closely model the input to cuSZ's Huffman coding stage. From these distributions, we create a dictionary of pre-computed codebooks such that during compression, a codebook is selected from the dictionary instead of computing a custom codebook. Second, we explore three codebook selection criteria to be applied at runtime. Finally, we evaluate our scheme on real-world datasets and in the context of two important application use cases, HDF5 and MPI, using an NVIDIA A100 GPU. Our evaluation shows that our method can reduce the Huffman coding penalty by a factor of 78--92×, translating to a total speedup of up to 5× over baseline cuSZ. Smaller HDF5 chunk sizes enjoy over an 8× speedup in compression, and MPI messages on the scale of tens of MB see a 1.4--30.5× speedup in communication time.},
  booktitle={Proceedings of the 37th International Conference on Supercomputing (ACM ICS 2023)},
  author={Shah, Milan and Yu, Xiaodong and Di, Sheng and Becchi, Michela and Cappello, Franck},
  year={2023},
  pages={99--110}
}

@inproceedings{shah_neff_wu_minutoli_tumeo_becchi_2022,
  title={Accelerating Random Forest Classification on GPU and FPGA},
  ISSN={0190-3918},
  DOI={10.1145/3545008.3545067},
  abstractNote={Random Forests (RFs) are a commonly used machine learning method for classification and regression tasks spanning a variety of application domains, including bioinformatics, business analytics, and software optimization. While prior work has focused primarily on improving the performance of RF training, many applications, such as malware identification, cancer prediction, and banking fraud detection, require fast RF classification. In this work, we accelerate RF classification on GPU and FPGA. In order to provide efficient support for large datasets, we propose a hierarchical memory layout suitable to the GPU/FPGA memory hierarchy. We design three RF classification code variants based on that layout, and we investigate GPU- and FPGA-specific considerations for these kernels. Our experimental evaluation, performed on an Nvidia Xp GPU and on a Xilinx Alveo U250 FPGA accelerator card using publicly available datasets on the scale of millions of samples and tens of features, covers various aspects. First, we evaluate the performance benefits of our hierarchical data structure over the standard compressed sparse row (CSR) format. Second, we compare our GPU implementation with cuML, a machine learning library targeting Nvidia GPUs. Third, we explore the performance/accuracy tradeoff resulting from the use of different tree depths in the RF. Finally, we perform a comparative performance analysis of our GPU and FPGA implementations. Our evaluation shows that, while reporting the best performance on GPU, our code variants outperform the CSR baseline on both GPU and FPGA. For high accuracy targets, our GPU implementation yields a 5-9× speedup over CSR, and up to a 2× speedup over Nvidia's cuML library.},
  booktitle={51st International Conference on Parallel Processing (ICPP 2022)},
  author={Shah, Milan and Neff, Reece and Wu, Hancheng and Minutoli, Marco and Tumeo, Antonino and Becchi, Michela},
  year={2022}
}