@article{zhao_chabbi_liu_2024, title={EASYVIEW: Bringing Performance Profiles into Integrated Development Environments}, ISSN={2164-2397}, DOI={10.1109/CGO57630.2024.10444840}, journal={2024 IEEE/ACM INTERNATIONAL SYMPOSIUM ON CODE GENERATION AND OPTIMIZATION, CGO}, author={Zhao, Qidong and Chabbi, Milind and Liu, Xu}, year={2024}, pages={386–398} }

@article{li_su_chabbi_jiao_liu_2023, title={DJXPerf: Identifying Memory Inefficiencies via Object-Centric Profiling for Java}, ISSN={2164-2397}, DOI={10.1145/3579990.3580010}, abstractNote={Java is the “go-to” programming language for developing scalable enterprise cloud applications. In such systems, even a few percent CPU time savings can offer a significant competitive advantage and cost savings. Although performance tools abound for Java, those that focus on data locality in the memory hierarchy are rare. In this paper, we first categorize data locality issues in Java programs. We then present DJXPerf, a lightweight, object-centric memory profiler for Java, which associates memory-hierarchy performance metrics (e.g., cache/TLB misses) with Java objects. DJXPerf uses statistical sampling of hardware performance monitoring counters to attribute metrics not only to source code locations but also to Java objects. DJXPerf combines Java object allocation contexts with their usage contexts and presents them ordered by the severity of their locality problems. DJXPerf’s performance measurement, object attribution, and presentation techniques guide optimizing object allocation, layout, and access patterns. DJXPerf incurs only ~8.5% runtime overhead and ~6% memory overhead on average, requiring no modifications to hardware, OS, Java virtual machine, or application source code, which makes it attractive for use in production. Guided by DJXPerf, we study and optimize a number of Java and Scala programs, including well-known benchmarks and real-world applications, and demonstrate significant speedups.}, journal={PROCEEDINGS OF THE 21ST ACM/IEEE INTERNATIONAL SYMPOSIUM ON CODE GENERATION AND OPTIMIZATION, CGO 2023}, author={Li, Bolun and Su, Pengfei and Chabbi, Milind and Jiao, Shuyin and Liu, Xu}, year={2023}, pages={81–94} }

@article{li_zhao_jiao_liu_2023, title={DroidPerf: Profiling Memory Objects on Android Devices}, DOI={10.1145/3570361.3592503}, abstractNote={Optimizing performance inefficiencies in memory hierarchies is well studied for native languages, such as C and C++. There are few studies, however, that explore memory inefficiencies in the Android Runtime (ART). Running in ART, managed languages, such as Java and Kotlin, employ various abstractions, such as runtime support, ahead-of-time (AOT) compilation, and garbage collection (GC), which hide important execution details from the plain source code. In this paper, we develop DroidPerf, a lightweight, object-centric memory profiler for ART, which associates memory inefficiencies with objects created and used in Android apps. With such object-level information, DroidPerf is able to guide locality optimization of memory layouts, access patterns, and allocation patterns. Guided by DroidPerf, we optimize a number of popular Android apps and obtain significant performance gains. Many inefficiencies are confirmed by the code authors, and optimization patches are under evaluation for upstreaming. As a practical tool, DroidPerf incurs ~32% runtime overhead and ~14% memory overhead on average.
Furthermore, DroidPerf works in production environments with off-the-shelf hardware, OS, Dalvik virtual machine, ART, and unmodified Android app source code.}, journal={PROCEEDINGS OF THE 29TH ANNUAL INTERNATIONAL CONFERENCE ON MOBILE COMPUTING AND NETWORKING, MOBICOM 2023}, author={Li, Bolun and Zhao, Qidong and Jiao, Shuyin and Liu, Xu}, year={2023}, pages={75–89} }

@article{li_guo_luo_wang_wang_liu_2022, title={Graph Neural Networks Based Memory Inefficiency Detection Using Selective Sampling}, ISSN={2167-4329}, DOI={10.1109/SC41404.2022.00090}, abstractNote={Production software in data centers oftentimes suffers from unnecessary memory inefficiencies caused by inappropriate use of data structures, conservative compiler optimizations, and so forth. Nevertheless, whole-program monitoring tools often incur extremely high overhead due to fine-grained memory access instrumentation. Consequently, fine-grained monitoring tools are not viable for long-running, large-scale data center applications with strict latency criteria (e.g., service-level agreements, or SLAs). To this end, this work presents a novel learning-aided system, namely Puffin, to identify three kinds of unnecessary memory operations (dead stores, silent loads, and silent stores) by applying gated graph neural networks to fused static and dynamic program semantics with relative positional embedding. To deploy the system in large-scale data centers, this work explores a sampling-based detection infrastructure with high efficacy and negligible overhead. We evaluate Puffin on the well-known SPEC CPU 2017 benchmark suite under four compilation options. Experimental results show that the proposed method captures the three kinds of memory inefficiencies with accuracy as high as 96% while reducing checking overhead by 5.66× over the state-of-the-art tool.}, journal={SC22: INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS}, author={Li, Pengcheng and Guo, Yixin and Luo, Yingwei and Wang, Xiaolin and Wang, Zhenlin and Liu, Xu}, year={2022} }

@article{li_xu_zhao_su_chabbi_jiao_liu_2022, title={OJXPerf: Featherlight Object Replica Detection for Java Programs}, ISSN={0270-5257}, DOI={10.1145/3510003.3510083}, abstractNote={Memory bloat is an important source of inefficiency in complex production software, especially in software written in managed languages such as Java. Prior approaches to this problem have focused on identifying objects that outlive their life span. Few studies, however, have looked into whether and to what extent myriad objects of the same type are identical. A quantitative assessment of identical objects with code-level attribution can assist developers in refactoring code to eliminate object bloat and favor reuse of existing objects. The result is reduced memory pressure, reduced allocation and garbage collection, enhanced data locality, and reduced re-computation, all of which yield superior performance. We develop OJXPerf, a lightweight sampling-based profiler, which probabilistically identifies identical objects. OJXPerf employs hardware performance monitoring units (PMUs) in conjunction with hardware debug registers to sample and compare field values of different objects of the same type allocated at the same calling context but potentially accessed at different program points. The result is a lightweight measurement: a combination of object allocation contexts and usage contexts ordered by duplication frequency.
This class of duplicated objects is relatively easy to optimize. OJXPerf incurs 9% runtime and 6% memory overheads on average. We empirically show the benefit of OJXPerf by using its profiles to guide the optimization of a number of Java programs, including well-known benchmarks and real-world applications. The results show a noticeable reduction in memory usage (up to 11%) and a significant speedup (up to 25%).}, journal={2022 ACM/IEEE 44TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2022)}, author={Li, Bolun and Xu, Hao and Zhao, Qidong and Su, Pengfei and Chabbi, Milind and Jiao, Shuyin and Liu, Xu}, year={2022}, pages={1558–1570} }

@article{zhou_hao_mellor-crummey_meng_liu_2022, title={VALUEEXPERT: Exploring Value Patterns in GPU-Accelerated Applications}, url={http://dx.doi.org/10.1145/3503222.3507708}, DOI={10.1145/3503222.3507708}, abstractNote={General-purpose GPUs have become common in modern computing systems to accelerate applications in many domains, including machine learning, high-performance computing, and autonomous driving. However, inefficiencies abound in GPU-accelerated applications, which prevent them from obtaining bare-metal performance. Performance tools play an important role in understanding performance inefficiencies in complex code bases. Many GPU performance tools pinpoint time-consuming code and provide high-level performance insights but overlook one important performance issue: value-related inefficiencies, which exist in many GPU code bases. In this paper, we present ValueExpert, a novel tool to pinpoint value-related inefficiencies in GPU applications. ValueExpert monitors application execution to capture values produced and used by each load and store operation in GPU kernels, recognizes multiple value patterns, and provides intuitive optimization guidance. We address systemic challenges in collecting, maintaining, and analyzing voluminous performance data from many GPU threads to make ValueExpert applicable to complex applications. We evaluate ValueExpert on a wide range of well-tuned benchmarks and applications, including PyTorch, Darknet, LAMMPS, Castro, and many others. ValueExpert is able to identify previously unknown performance issues and provide suggestions for nontrivial performance improvements, typically with fewer than five lines of code changes. We verify our optimizations with application developers and upstream fixes to their repositories.}, journal={ASPLOS '22: PROCEEDINGS OF THE 27TH ACM INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS}, publisher={ACM}, author={Zhou, Keren and Hao, Yueming and Mellor-Crummey, John and Meng, Xiaozhu and Liu, Xu}, year={2022}, pages={171–185} }

@article{tan_chen_liu_ren_song_shen_liu_2021, title={Toward Efficient Interactions between Python and Native Libraries}, DOI={10.1145/3468264.3468541}, abstractNote={Python has become a popular programming language because of its excellent programmability. Many modern software packages utilize Python for high-level algorithm design and depend on native libraries written in C/C++/Fortran for efficient computation kernels. Interaction between Python code and native libraries introduces performance losses because of the abstraction at the boundary between Python and native code. On one side, Python code, which is typically interpreted, is disjoint from its low-level execution behavior. On the other side, native libraries lack the program semantics needed to understand algorithmic defects.
To understand these interaction inefficiencies, we extensively study a large collection of Python software packages and categorize the inefficiencies according to their root causes. We extract two patterns that are common across these interaction inefficiencies. Based on these patterns, we develop PieProf, a lightweight profiler, to pinpoint interaction inefficiencies in Python applications. PieProf measures inefficiencies in the native execution and associates them with high-level Python code to provide a holistic view. Guided by PieProf, we optimize 17 real-world applications, yielding application-level speedups of up to 6.3×.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Tan, Jialiang and Chen, Yu and Liu, Zhenming and Ren, Bin and Song, Shuaiwen Leon and Shen, Xipeng and Liu, Xu}, year={2021}, pages={1117–1128} }

@article{zhao_liu_chabbi_2020, title={DRCCTPROF: A Fine-Grained Call Path Profiler for ARM-Based Clusters}, DOI={10.1109/SC41405.2020.00034}, abstractNote={ARM is an attractive CPU architecture for exascale systems because of its energy efficiency. As a recent entry into the HPC paradigm, ARM lags in its software stack, especially in performance tooling. Notably, there is a lack of fine-grained measurement tools to analyze fully optimized HPC binary executables on ARM processors. In this paper, we introduce DRCCTPROF, a fine-grained call path profiling framework for binaries running on ARM architectures. The unique ability of DRCCTPROF is to obtain the full calling context at every machine instruction that executes, which provides more detailed diagnostic feedback for performance optimization and correctness tools. Furthermore, DRCCTPROF not only associates each instruction with source code along the call path, but also associates memory access instructions back to their constituent data objects. Finally, DRCCTPROF incurs moderate overhead and provides a compact view to visualize the profiles collected from parallel executions.}, journal={PROCEEDINGS OF SC20: THE INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS (SC20)}, author={Zhao, Qidong and Liu, Xu and Chabbi, Milind}, year={2020} }

@article{zhou_hao_mellor-crummey_meng_liu_2020, title={GVProf: A Value Profiler for GPU-Based Clusters}, volume={2020-November}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85102360712&partnerID=MN8TOARS}, DOI={10.1109/SC41405.2020.00093}, abstractNote={GPGPUs are widely used in high-performance computing systems to accelerate scientific and machine learning workloads. Developing efficient GPU kernels is critically important to obtain “bare-metal” performance on GPU-based clusters. In this paper, we describe the design and implementation of GVProf, the first value profiler that pinpoints value-related inefficiencies in applications running on NVIDIA GPU-based clusters. The novelty of GVProf resides in its ability to detect temporal and spatial value redundancies, which provides useful information to guide code optimization. GVProf can monitor production multi-node, multi-GPU executions in clusters. Our experiments with well-known GPU benchmarks and HPC applications show that GVProf incurs acceptable overhead and scales to large executions. Using GVProf, we optimized several HPC and machine learning workloads on one NVIDIA V100 GPU.
In a case study of LAMMPS, optimizations based on feedback from GVProf led to whole-program speedups of 1.37× on a single GPU and 1.08× on 64 GPUs.}, journal={PROCEEDINGS OF SC20: THE INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS (SC20)}, author={Zhou, Keren and Hao, Yueming and Mellor-Crummey, John and Meng, Xiaozhu and Liu, Xu}, year={2020} }

@article{jin_wang_yu_tang_hoefler_liu_zhai_2020, title={SCALANA: Automating Scaling Loss Detection with Graph Analysis}, DOI={10.1109/SC41405.2020.00032}, abstractNote={Scaling a parallel program to modern supercomputers is challenging due to inter-process communication, Amdahl’s law, and resource contention. Performance analysis tools for finding such scaling bottlenecks are based on either profiling or tracing. Profiling incurs low overhead but does not capture the detailed dependencies needed for root-cause analysis. Tracing collects all information, but at prohibitive overhead. In this work, we design SCALANA, which uses static analysis techniques to achieve the best of both worlds: it enables the analyzability of traces at a cost similar to profiling. SCALANA first leverages static compiler techniques to build a Program Structure Graph, which records the main computation and communication patterns as well as the program’s control structures. At runtime, we adopt lightweight techniques to collect performance data according to the graph structure and generate a Program Performance Graph. With this graph, we propose a novel approach, called backtracking root cause detection, which can automatically and efficiently detect the root cause of scaling loss. We evaluate SCALANA with real applications. Results show that our approach can effectively locate the root cause of scaling loss for real applications and incurs 1.73% overhead on average for up to 2,048 processes. We achieve up to 11.11% performance improvement by fixing the root causes detected by SCALANA on 2,048 processes.}, journal={PROCEEDINGS OF SC20: THE INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS (SC20)}, author={Jin, Yuyang and Wang, Haojie and Yu, Teng and Tang, Xiongchao and Hoefler, Torsten and Liu, Xu and Zhai, Jidong}, year={2020} }

@article{you_yang_luan_qian_liu_2020, title={ZeroSpy: Exploring Software Inefficiency with Redundant Zeros}, DOI={10.1109/SC41405.2020.00033}, abstractNote={Redundant zeros cause inefficiencies in which zero values are loaded and computed repeatedly, resulting in unnecessary memory traffic and identity computation that waste memory bandwidth and CPU resources. It is difficult for optimizing compilers to eliminate these zero-related inefficiencies due to limitations in static analysis. Hardware approaches, in contrast, optimize such inefficiencies without code modification, but are not widely adopted in commodity processors. In this paper, we propose ZeroSpy, a fine-grained profiler that identifies redundant zeros caused by both inappropriate use of data structures and useless computation. ZeroSpy also provides intuitive optimization guidance by revealing the source lines and calling contexts where the redundant zeros occur. The experimental results demonstrate that ZeroSpy is capable of identifying redundant zeros in programs that have been highly optimized for years.
Based on the optimization guidance provided by ZeroSpy, we achieve significant speedups by eliminating redundant zeros.}, journal={PROCEEDINGS OF SC20: THE INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS (SC20)}, author={You, Xin and Yang, Hailong and Luan, Zhongzhi and Qian, Depei and Liu, Xu}, year={2020} }