@article{ravi_byna_koziol_tang_becchi_2023, title={Evaluating Asynchronous Parallel I/O on HPC Systems}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS54959.2023.00030}, abstractNote={Parallel I/O is an effective method to optimize data movement between memory and storage for many scientific applications. Poor performance of traditional disk-based file systems has led to the design of I/O libraries which take advantage of faster memory layers, such as on-node memory, present in high-performance computing (HPC) systems. By allowing caching and prefetching of data for applications alternating computation and I/O phases, a faster memory layer also provides opportunities for hiding the latency of I/O phases by overlapping them with computation phases, a technique called asynchronous I/O. Since asynchronous parallel I/O in HPC systems is still in the initial stages of development, there hasn't been a systematic study of the factors affecting its performance. In this paper, we perform a systematic study of various factors affecting the performance and efficacy of asynchronous I/O, we develop a performance model to estimate the aggregate I/O bandwidth achievable by iterative applications using synchronous and asynchronous I/O based on past observations, and we evaluate the performance of the recently developed asynchronous I/O feature of a parallel I/O library (HDF5) using benchmarks and real-world science applications. Our study covers parallel file systems on two large-scale HPC systems: Summit and Cori, the former with GPFS storage and the latter with a Lustre parallel file system.}, journal={2023 IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM, IPDPS}, author={Ravi, John and Byna, Suren and Koziol, Quincey and Tang, Houjun and Becchi, Michela}, year={2023}, pages={211–221} } @article{ravi_byna_becchi_2023, title={Runway: In-transit Data Compression on Heterogeneous HPC Systems}, DOI={10.1109/CCGRID57682.2023.00030}, abstractNote={To alleviate bottlenecks in storing and accessing data on high-performance computing (HPC) systems, I/O libraries are enabling computation while data is in transit, such as HDF5 filters. For scientific applications that commonly use floating-point data, error-bounded lossy compression methods are a critical technique to significantly reduce the storage and bandwidth requirements. Thus far, deciding when and where to schedule in-transit data transformations, such as compression, has been outside the scope of I/O libraries. In this paper, we introduce Runway, a runtime framework that enables computation on in-transit data with an object storage abstraction. Runway is designed to be extensible to execute user-defined functions at runtime. In this effort, we focus on studying methods to offload data compression operations to available processing units based on latency and throughput. We compare the performance of running compression on multi-core CPUs, as well as offloading it to a GPU and a Data Processing Unit (DPU). We implement a state-of-the-art error-bounded lossy compression algorithm, SZ3, as a Runway function with a variant optimized for DPUs. We propose dynamic modeling to guide scheduling decisions for in-transit data compression.
We evaluate Runway using four scientific datasets from the SDRBench benchmark suite on the Perlmutter supercomputer at NERSC.}, journal={2023 IEEE/ACM 23RD INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND INTERNET COMPUTING, CCGRID}, author={Ravi, John and Byna, Suren and Becchi, Michela}, year={2023}, pages={229–239} } @article{ravi_byna_becchi_2023, title={Runway: In-transit Data Compression on Heterogeneous HPC Systems}, DOI={10.1109/CCGridW59191.2023.00078}, abstractNote={To alleviate bottlenecks in storing and accessing data on high-performance computing (HPC) systems, I/O libraries are enabling computation while data is in transit, such as HDF5 filters. For scientific applications that commonly use floating-point data, error-bounded lossy compression methods are a critical technique to significantly reduce the storage and bandwidth requirements. Thus far, deciding when and where to schedule in-transit data transformations, such as compression, has been outside the scope of I/O libraries. In this paper, we introduce Runway, a runtime framework that enables computation on in-transit data with an object storage abstraction. Runway is designed to be extensible to execute user-defined functions at runtime. In this effort, we focus on studying methods to offload data compression operations to available processing units based on latency and throughput. We compare the performance of running compression on multi-core CPUs, as well as offloading it to a GPU and a Data Processing Unit (DPU). We implement a state-of-the-art error-bounded lossy compression algorithm, SZ3, as a Runway function with a variant optimized for DPUs. We propose dynamic modeling to guide scheduling decisions for in-transit data compression.}, journal={2023 IEEE/ACM 23RD INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND INTERNET COMPUTING WORKSHOPS, CCGRIDW}, author={Ravi, John and Byna, Suren and Becchi, Michela}, year={2023}, pages={340–342} } @article{zheng_vishwanath_koziol_tang_ravi_mainzer_byna_2022, title={HDF5 Cache VOL: Efficient and Scalable Parallel I/O through Caching Data on Node-local Storage}, DOI={10.1109/CCGrid54584.2022.00015}, abstractNote={Modern-era high-performance computing (HPC) systems are providing multiple levels of memory and storage layers to bridge the performance gap between fast memory and slow disk-based storage systems managed by Lustre or GPFS. Several of the recent HPC systems are equipped with SSD and NVMe-based storage that is attached locally to compute nodes. A few systems are providing an SSD-based “burst buffer” intermediate storage layer that is accessible by all compute nodes as a single file system. Although these hardware layers are intended to reduce the latency gap between memory and disk-based long-term storage, how to utilize them has been left to the users. High-level I/O libraries, such as HDF5 and netCDF, can potentially take advantage of the node-local storage as a cache for reducing I/O latency from capacity storage. However, it is challenging to use node-local storage in parallel I/O, especially for a single shared file. In this paper, we present an approach to integrate node-local storage as transparent caching or staging layers in a high-level parallel I/O library without placing the burden of managing these layers on users. We designed this approach to move data asynchronously between the caching storage layer and a parallel file system to overlap the data movement overhead in performing I/O with compute phases.
We implement this approach as an external HDF5 Virtual Object Layer (VOL) connector, named Cache VOL. HDF5 VOL is a layer of abstraction in HDF5 that allows intercepting the public HDF5 application programming interface (API) and performing various optimizations to data movement after the interception. Existing HDF5 applications can use Cache VOL with minimal code modifications. We evaluated the performance of Cache VOL in HPC applications such as VPIC-IO, and deep learning applications such as ImageNet and CosmoFlow. We show that using Cache VOL, one can achieve higher observed I/O performance and more scalable, stable I/O compared to direct I/O to the parallel file system, thus achieving faster time-to-solution in scientific simulations. While the caching approach is implemented in HDF5, the methods are applicable to other high-level I/O libraries.}, journal={2022 22ND IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND INTERNET COMPUTING (CCGRID 2022)}, author={Zheng, Huihuo and Vishwanath, Venkatram and Koziol, Quincey and Tang, Houjun and Ravi, John and Mainzer, John and Byna, Suren}, year={2022}, pages={61–70} } @article{tang_koziol_ravi_byna_2022, title={Transparent Asynchronous Parallel I/O Using Background Threads}, volume={33}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2021.3090322}, abstractNote={Moving toward exascale computing, the size of data stored and accessed by applications is ever increasing. However, traditional disk-based storage has not seen improvements that keep up with the explosion of data volume or the speed of processors. Multiple levels of non-volatile storage devices are being added to handle bursty I/O; however, moving data across the storage hierarchy can take longer than the data generation or analysis. Asynchronous I/O can reduce the impact of I/O latency as it allows applications to schedule I/O operations early and to check their status later. I/O is thus overlapped with application communication or computation or both, effectively hiding some or all of the I/O latency. POSIX and MPI-I/O provide asynchronous read and write operations, but lack the support for non-data operations such as file open and close. Users also have to manually manage data dependencies and use low-level byte offsets, which requires significant effort and expertise to adopt. In this article, we present an asynchronous I/O framework that supports all types of I/O operations, manages data dependencies transparently and automatically, provides implicit and explicit modes for application flexibility, and supports error information retrieval. We implemented these techniques in HDF5. Our evaluation of several benchmarks and application workloads demonstrates its effectiveness in hiding the I/O cost from the application.}, number={4}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Tang, Houjun and Koziol, Quincey and Ravi, John and Byna, Suren}, year={2022}, month={Apr}, pages={891–902} } @article{bellingham-johnstun_anders_ravi_bruinsma_laplante_2021, title={Molecular organization of cytokinesis node predicts the constriction rate of the contractile ring}, volume={220}, ISSN={["1540-8140"]}, DOI={10.1083/jcb.202008032}, abstractNote={The molecular organization of cytokinesis proteins governs contractile ring function. We used single molecule localization microscopy in live cells to elucidate the molecular organization of cytokinesis proteins and relate it to the constriction rate of the contractile ring.
Wild-type fission yeast cells assemble contractile rings by the coalescence of cortical protein complexes called nodes, whereas cells without Anillin/Mid1p (Δmid1) lack visible nodes yet assemble contractile rings competent for constriction from the looping of strands. We leveraged the Δmid1 contractile ring assembly mechanism to determine how two distinct molecular organizations, nodes versus strands, can yield functional contractile rings. Contrary to previous interpretations, nodes assemble in Δmid1 cells. Our results suggest that Myo2p heads condense upon interaction with actin filaments and that an excess number of Myo2p heads bound to actin filaments hinders constriction, thus reducing the constriction rate. Our work establishes a predictive correlation between the molecular organization of nodes and the behavior of the contractile ring.}, number={3}, journal={JOURNAL OF CELL BIOLOGY}, author={Bellingham-Johnstun, Kimberly and Anders, Erica Casey and Ravi, John and Bruinsma, Christina and Laplante, Caroline}, year={2021}, month={Mar} } @article{ravi_nguyen_zhou_becchi_2021, title={PILOT: a Runtime System to Manage Multi-tenant GPU Unified Memory Footprint}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC53243.2021.00063}, abstractNote={Concurrent kernel execution on GPUs has proven an effective technique to improve system throughput by maximizing resource utilization. In order to increase programmability and meet the increasing memory requirements of data-intensive applications, current GPUs support Unified Virtual Memory (UVM), which provides a virtual memory abstraction with demand paging. By allowing applications to oversubscribe GPU memory, UVM provides increased opportunities to share GPU resources across applications. However, in the presence of applications with competing memory requirements, GPU sharing can lead to performance degradation due to thrashing. NVIDIA's Multi-Process Service (MPS) offers the capability to space-share bare-metal GPUs, thereby enabling cluster workload managers, such as Slurm, to share a single GPU across MPI ranks with limited control over resource partitioning. However, it is not possible to preempt, schedule, or throttle a running GPU process through MPS. These features would enable new OS-managed scheduling policies to be implemented for GPU kernels to dynamically handle resource contention and offer consistent performance. The contribution of this paper is two-fold. We first show how memory oversubscription can impact the performance of concurrent GPU applications. Then, we propose three methods to transparently mitigate memory interference through kernel preemption and scheduling policies. To implement our policies, we develop our own runtime system (PILOT) to serve as an alternative to NVIDIA's MPS. In the presence of memory oversubscription, we noticed a dramatic improvement in the overall throughput when using our scheduling policies and runtime hints.}, journal={2021 IEEE 28TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING, DATA, AND ANALYTICS (HIPC 2021)}, author={Ravi, John and Nguyen, Tri and Zhou, Huiyang and Becchi, Michela}, year={2021}, pages={442–447} } @article{ravi_byna_koziol_2020, title={GPU Direct I/O with HDF5}, DOI={10.1109/PDSW51947.2020.00010}, abstractNote={Exascale HPC systems are being designed with accelerators, such as GPUs, to accelerate parts of applications.
In machine learning workloads as well as large-scale simulations that use GPUs as accelerators, the CPU (or host) memory is currently used as a buffer for data transfers between GPU (or device) memory and the file system. If the CPU does not need to operate on the data, then this is sub-optimal because it wastes host memory by reserving space for duplicated data. Furthermore, this “bounce buffer” approach wastes CPU cycles spent on transferring data. A new technique, NVIDIA GPUDirect Storage (GDS), can eliminate the need to use the host memory as a bounce buffer. Thereby, it becomes possible to transfer data directly between the device memory and the file system. This direct data path shortens latency by omitting the extra copy and enables higher bandwidth. To take full advantage of GDS in existing applications, it is necessary to provide support within existing I/O libraries, such as HDF5 and MPI-IO, which are heavily used in applications. In this paper, we describe our effort of integrating GDS with HDF5, the top I/O library at NERSC and at DOE leadership computing facilities. We design and implement this integration using an HDF5 Virtual File Driver (VFD). The GDS VFD provides a file system abstraction to the application that allows HDF5 applications to perform I/O without the need to move data between CPUs and GPUs explicitly. We compare the performance of the HDF5 GDS VFD with explicit data movement approaches and demonstrate superior performance with the GDS method.}, journal={PROCEEDINGS OF 2020 IEEE/ACM FIFTH INTERNATIONAL PARALLEL DATA SYSTEMS WORKSHOP (PDSW 2020)}, author={Ravi, John and Byna, Suren and Koziol, Quincey}, year={2020}, pages={28–33} } @article{wu_ravi_becchi_2018, title={Compiling SIMT Programs on Multi- and Many-core Processors with Wide Vector Units: A Case Study with CUDA}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC.2018.00022}, abstractNote={Manycore processors and coprocessors with wide vector extensions, such as Intel Phi and Skylake devices, have become popular due to their high throughput capability. Performance optimization on these devices requires using both their x86-compatible cores and their vector units. While the x86-compatible cores can be programmed using traditional programming interfaces following the MIMD model, such as POSIX threads, MPI and OpenMP, the SIMD vector units are harder to program. The Intel software stack provides two approaches for code vectorization: automatic vectorization through the Intel compiler and manual vectorization through vector intrinsics. While the Intel compiler often fails to vectorize code with complex control flows and function calls, the manual approach is error-prone and leads to less portable code. Hence, there has been an increasing interest in SIMT programming tools allowing the simultaneous use of x86 cores and vector units while providing programmability and code portability. However, the effective implementation of the SIMT model on these hybrid architectures is not well understood. In this work, we target this problem. First, we propose a set of compiler techniques to transform programs written using a SIMT programming model (a subset of CUDA C) into code that leverages both the x86 cores and the vector units of a hybrid MIMD/SIMD architecture, thus providing programmability, high system utilization and performance. Second, we evaluate the proposed techniques on Xeon Phi and Skylake processors using micro-benchmarks and real-world applications.
Third, we compare the resulting performance with that achieved by the same code on GPUs. Based on this analysis, we point out the main challenges in supporting the SIMT model on hybrid MIMD/SIMD architectures, while providing performance comparable to that of SIMT systems (e.g., GPUs).}, journal={2018 IEEE 25TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING (HIPC)}, author={Wu, Hancheng and Ravi, John and Becchi, Michela}, year={2018}, pages={123–132} }