@inproceedings{hao_jain_wijngaart_saxena_fan_liu_2023, title={DrGPU: A Top-Down Profiler for GPU Applications}, url={https://doi.org/10.1145/3578244.3583736}, DOI={10.1145/3578244.3583736}, abstractNote={GPUs have become common in HPC systems to accelerate scientific computing and machine learning applications. Efficiently mapping these applications to rapid evolutions of GPU architectures for high performance is a well-known challenge. Various performance inefficiencies exist in GPU kernels that impede applications from obtaining bare-metal performance. While existing tools are able to measure these inefficiencies, they mostly focus on data collection and presentation, requiring significant manual efforts to understand the root causes for actionable optimization. Thus, we develop DrGPU, a novel profiler that performs top-down analysis to guide GPU code optimization. As its salient feature, DrGPU leverages hardware performance counters available in commodity GPUs to quantify stall cycles, decompose them into various stall reasons, pinpoint root causes, and provide intuitive optimization guidance. With the help of DrGPU, we are able to analyze important GPU benchmarks and applications and obtain nontrivial speedups --- up to 1.77X on V100 and 2.03X on GTX 1650.}, author={Hao, Yueming and Jain, Nikhil and Wijngaart, Rob Van and Saxena, Nirmal and Fan, Yuanbo and Liu, Xu}, year={2023}, month={Apr} } @article{zhou_hao_mellor-crummey_meng_liu_2022, title={VALUEEXPERT: Exploring Value Patterns in GPU-Accelerated Applications}, url={http://dx.doi.org/10.1145/3503222.3507708}, DOI={10.1145/3503222.3507708}, abstractNote={General-purpose GPUs have become common in modern computing systems to accelerate applications in many domains, including machine learning, high-performance computing, and autonomous driving. However, inefficiencies abound in GPU-accelerated applications, which prevent them from obtaining bare-metal performance. Performance tools play an important role in understanding performance inefficiencies in complex code bases. Many GPU performance tools pinpoint time-consuming code and provide high-level performance insights but overlook one important performance issue---value-related inefficiencies, which exist in many GPU code bases. In this paper, we present ValueExpert, a novel tool to pinpoint value-related inefficiencies in GPU applications. ValueExpert monitors application execution to capture values produced and used by each load and store operation in GPU kernels, recognizes multiple value patterns, and provides intuitive optimization guidance. We address systemic challenges in collecting, maintaining, and analyzing voluminous performance data from many GPU threads to make ValueExpert applicable to complex applications. We evaluate ValueExpert on a wide range of well-tuned benchmarks and applications, including PyTorch, Darknet, LAMMPS, Castro, and many others. ValueExpert is able to identify previously unknown performance issues and provide suggestions for nontrivial performance improvements with typically less than five lines of code changes. We verify our optimizations with application developers and upstream fixes to their repositories.}, journal={ASPLOS '22: PROCEEDINGS OF THE 27TH ACM INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS}, publisher={ACM}, author={Zhou, Keren and Hao, Yueming and Mellor-Crummey, John and Meng, Xiaozhu and Liu, Xu}, year={2022}, pages={171–185} } @article{zhou_hao_mellor-crummey_meng_liu_2020, title={GVPRoF: A Value Profiler for GPU-Based Clusters}, volume={2020-November}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85102360712&partnerID=MN8TOARS}, DOI={10.1109/SC41405.2020.00093}, abstractNote={GPGPUs are widely used in high-performance computing systems to accelerate scientific and machine learning workloads. Developing efficient GPU kernels is critically important to obtain “bare-metal” performance on GPU-based clusters. In this paper, we describe the design and implementation of GVPROF, the first value profiler that pinpoints value-related inefficiencies in applications running on NVIDIA GPU-based clusters. The novelty of GVPROF resides in its ability to detect temporal and spatial value redundancies, which provides useful information to guide code optimization. GVPROF can monitor production multi-node multi-GPU executions in clusters. Our experiments with well-known GPU benchmarks and HPC applications show that GVPROF incurs acceptable overhead and scales to large executions. Using GVPROF, we optimized several HPC and machine learning workloads on one NVIDIA V100 GPU. In one case study of LAMMPS, optimizations based on information from GVProf led to whole-program speedups ranging from 1.37x on a single GPU to 1.08x on 64 GPUs.}, journal={PROCEEDINGS OF SC20: THE INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS (SC20)}, author={Zhou, Keren and Hao, Yueming and Mellor-Crummey, John and Meng, Xiaozhu and Liu, Xu}, year={2020} }