@inproceedings{gu_beata_becchi_2020,
  title={A Loop-aware Autotuner for High-Precision Floating-point Applications},
  DOI={10.1109/ISPASS48437.2020.00048},
  abstractNote={Many scientific applications (e.g., molecular dynamics, climate modeling, and astrophysical simulations) rely on floating-point arithmetic. Due to its approximate nature, the use of floating-point arithmetic can lead to inaccuracy and reproducibility issues, which can be particularly significant for long-running applications. Indeed, previous work has shown that 64-bit IEEE floating-point arithmetic can be insufficient for many algorithms and applications, such as ill-conditioned linear systems, large summations, long-time or large-scale physical simulations, and experimental mathematics applications. To overcome these issues, existing work has proposed high-precision floating-point libraries (e.g., the GNU multiple precision arithmetic library), but these libraries come at the cost of significant execution time. In this work, we propose an auto-tuner for applications requiring high-precision floating-point arithmetic to deliver a prescribed level of accuracy. Our auto-tuner uses compiler analysis to discriminate operations and variables that require high precision from those that can be handled using standard IEEE 64-bit floating-point arithmetic, and it generates a mixed-precision program that trades off performance and accuracy by selectively using different precisions for different variables and operations. In particular, our auto-tuner leverages loop and data dependence analysis to quickly identify precision-sensitive variables and operations and to provide results that are robust to different input datasets. We test our auto-tuner on a mix of applications with different computational patterns.},
  booktitle={2020 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  author={Gu, Ruidong and Beata, Paul and Becchi, Michela},
  year={2020},
  pages={285–295}
}

@inproceedings{gu_becchi_2020,
  title={GPU-FPtuner: Mixed-precision Auto-tuning for Floating-point Applications on GPU},
  ISSN={1094-7256},
  DOI={10.1109/HiPC50609.2020.00043},
  abstractNote={GPUs have been extensively used to accelerate scientific applications from a variety of domains: computational fluid dynamics, astronomy and astrophysics, climate modeling, and numerical analysis, to name a few. Many of these applications rely on floating-point arithmetic, which is approximate in nature. High-precision libraries have been proposed to mitigate accuracy issues due to the use of floating-point arithmetic. However, these libraries offer increased accuracy at a significant performance cost. Previous work, primarily focusing on CPU code and on standard IEEE floating-point data types, has explored mixed precision as a compromise between performance and accuracy. In this work, we propose a mixed-precision autotuner for GPU applications that rely on floating-point arithmetic. Our tool supports standard 32- and 64-bit floating-point arithmetic, as well as high precision through the QD library. Our autotuner relies on compiler analysis to reduce the size of the tuning space. In particular, our tuning strategy takes into account code patterns prone to error propagation and GPU-specific considerations to generate a tuning plan that balances performance and accuracy. Our autotuner pipeline, implemented using the ROSE compiler and Python scripts, is fully automated, and the code is available as open source. Our experimental results, collected on benchmark applications with various code complexities, show the performance-accuracy tradeoffs for these applications and the effectiveness of our tool in identifying representative tuning points.},
  booktitle={2020 IEEE 27th International Conference on High Performance Computing, Data, and Analytics (HiPC 2020)},
  author={Gu, Ruidong and Becchi, Michela},
  year={2020},
  pages={294–304}
}

@inproceedings{gu_becchi_2019,
  title={A Comparative Study of Parallel Programming Frameworks for Distributed GPU Applications},
  DOI={10.1145/3310273.3323071},
  abstractNote={Parallel programming frameworks such as MPI, OpenSHMEM, Charm++, and Legion have been widely used in many scientific domains (from bioinformatics to computational physics and chemistry, among others) to implement distributed applications. While they serve the same purpose, these frameworks differ in terms of programmability, performance, and scalability across different applications and cluster types. Hence, it is important for programmers to select the programming framework that is best suited to the characteristics of their application (i.e., its computation and communication patterns) and the hardware setup of the target high-performance computing cluster. In this work, we consider several popular parallel programming frameworks for distributed applications. We first analyze their memory model, execution model, synchronization model, and GPU support. We then compare their programmability, performance, scalability, and load-balancing capability on a homogeneous computing cluster equipped with GPUs.},
  booktitle={CF '19 - Proceedings of the 16th ACM International Conference on Computing Frontiers},
  author={Gu, Ruidong and Becchi, Michela},
  year={2019},
  pages={268–273}
}

@inproceedings{surineni_gu_nguyen_becchi_2017,
  title={Understanding the Performance-accuracy Tradeoffs of Floating-point Arithmetic on GPUs},
  DOI={10.1109/iiswc.2017.8167778},
  abstractNote={Floating-point computations produce approximate results, possibly leading to inaccuracy and reproducibility problems. Existing work addresses two issues: first, the design of high-precision floating-point representations; second, the study of methods to trade off accuracy and performance in CPU applications. However, a comprehensive study of the tradeoffs between accuracy and performance on modern GPUs is missing. This study covers the use of different floating-point precisions (i.e., single and double floating-point precision in the IEEE 754 standard, GNU Multiple Precision, and composite floating-point precision) on GPUs using a variety of synthetic and real-world benchmark applications. First, we analyze the support for single- and double-precision floating-point arithmetic on different GPU architectures, and we characterize the latencies of all floating-point instructions on GPUs. Second, we study the performance/accuracy tradeoffs related to the use of different arithmetic precisions for addition, multiplication, division, and the natural exponential function. Third, we analyze the combined use of different arithmetic operations in three benchmark applications characterized by different instruction mixes and arithmetic intensities.
As a result of this analysis, we provide insights to guide users in selecting the arithmetic precision that leads to a good performance/accuracy tradeoff, depending on the arithmetic operations and mathematical functions used in their program and the degree of multithreading of the code.},
  booktitle={Proceedings of the 2017 IEEE International Symposium on Workload Characterization (IISWC)},
  author={Surineni, S. and Gu, R. D. and Nguyen, H. and Becchi, M.},
  year={2017},
  pages={207–218}
}