@article{zhang_tang_harenberg_byna_zou_devendran_martin_wu_dong_klasky_et al._2016, title={AMRZone: A Runtime AMR Data Sharing Framework For Scientific Applications}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2016.62}, abstractNote={Frameworks that facilitate runtime data sharing across multiple applications are of great importance for scientific data analytics. Although existing frameworks work well over uniform mesh data, they cannot effectively handle adaptive mesh refinement (AMR) data. The challenges in constructing an AMR-capable framework include: (1) designing an architecture that facilitates online AMR data management, (2) achieving a load-balanced AMR data distribution for the data staging space at runtime, and (3) building an effective online index to support the unique spatial data retrieval requirements of AMR data. Towards addressing these challenges to support runtime AMR data sharing across scientific applications, we present the AMRZone framework. Experiments over real-world AMR datasets demonstrate AMRZone's effectiveness at achieving a balanced workload distribution, reading/writing large-scale datasets with thousands of parallel processes, and satisfying queries with spatial constraints. Moreover, AMRZone's performance and scalability are comparable with existing state-of-the-art work when tested over uniform mesh data with up to 16384 cores; in the best case, our framework achieves a 46% performance improvement.}, journal={2016 16TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID)}, author={Zhang, Wenzhao and Tang, Houjun and Harenberg, Steve and Byna, Surendra and Zou, Xiaocheng and Devendran, Dharshi and Martin, Daniel F. and Wu, Kesheng and Dong, Bin and Klasky, Scott and et al.}, year={2016}, pages={116–125} }
@article{tang_byna_harenberg_zhang_zou_martin_dong_devendran_wu_trebotich_et al._2016, title={In situ Storage Layout Optimization for AMR Spatio-temporal Read Accesses}, ISSN={["0190-3918"]}, DOI={10.1109/icpp.2016.53}, abstractNote={Analyses of large simulation data often concentrate on regions in space and in time that contain important information. As simulations adopt Adaptive Mesh Refinement (AMR), the data records from a region of interest could be widely scattered on storage devices and accessing interesting regions results in significantly reduced I/O performance. In this work, we study the organization of block-structured AMR data on storage to improve performance of spatio-temporal data accesses. AMR has a complex hierarchical multi-resolution data structure that does not fit easily with the existing approaches that focus on uniform mesh data. To enable efficient AMR read accesses, we develop an in situ data layout optimization framework. Our framework automatically selects from a set of candidate layouts based on a performance model, and reorganizes the data before writing to storage. We evaluate this framework with three AMR datasets and access patterns derived from scientific applications. Our performance model is able to identify the best layout scheme and yields up to a 3X read performance improvement compared to the original layout. Though it is not possible to turn all read accesses into contiguous reads, we are able to achieve 90% of contiguous read throughput with the optimized layouts on average.}, journal={PROCEEDINGS 45TH INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING - ICPP 2016}, author={Tang, Houjun and Byna, Suren and Harenberg, Steve and Zhang, Wenzhao and Zou, Xiaocheng and Martin, Daniel F.
and Dong, Bin and Devendran, Dharshi and Wu, Kesheng and Trebotich, David and et al.}, year={2016}, pages={406–415} }
@article{tang_byna_harenberg_zou_zhang_wu_dong_rubel_bouchard_klasky_et al._2016, title={Usage Pattern-Driven Dynamic Data Layout Reorganization}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2016.15}, abstractNote={As scientific simulations and experiments move toward extremely large scales and generate massive amounts of data, the data access performance of analytic applications becomes crucial. A mismatch often happens between write and read patterns of data accesses, typically resulting in poor read performance. Data layout reorganization has been used to improve the locality of data accesses. However, current data reorganizations are static and focus on generating a single (or set of) optimized layouts that rely on prior knowledge of exact future access patterns. We propose a framework that dynamically recognizes the data usage patterns, replicates the data of interest in multiple reorganized layouts that would benefit common read patterns, and makes runtime decisions on selecting a favorable layout for a given read pattern. This framework supports reading individual elements and chunks of a multi-dimensional array of variables. Our pattern-driven layout selection strategy achieves multi-fold speedups compared to reading from the original dataset.}, journal={2016 16TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID)}, author={Tang, Houjun and Byna, Suren and Harenberg, Steve and Zou, Xiaocheng and Zhang, Wenzhao and Wu, Kesheng and Dong, Bin and Rubel, Oliver and Bouchard, Kristofer and Klasky, Scott and et al.}, year={2016}, pages={356–365} }
@article{zhang_tang_zou_harenberg_liu_klasky_samatova_2015, title={Exploring Memory Hierarchy to Improve Scientific Data Read Performance}, ISSN={["1552-5244"]}, DOI={10.1109/cluster.2015.18}, abstractNote={Improving read performance is one of the major challenges in speeding up scientific data analytic applications. Utilizing the memory hierarchy is one major line of research for addressing the read performance bottleneck. Related methods usually combine solid-state drives (SSDs) with dynamic random-access memory (DRAM) and/or a parallel file system (PFS) to mitigate the speed and space gap between DRAM and PFS. However, these methods are unable to handle a key performance issue plaguing SSDs, namely read contention, which may cause up to a 50% performance reduction. In this paper, we propose a framework that exploits memory hierarchy resources to address the read contention issues of SSDs. The framework employs a general-purpose online read algorithm that is able to detect and utilize memory hierarchy resources to relieve the problem. To maintain a near-optimal operating environment for SSDs, the framework is able to orchestrate data chunks across different memory layers to facilitate the read algorithm.
Compared to existing tools, our framework achieves up to a 50% read performance improvement when tested on datasets from real-world scientific simulations.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON CLUSTER COMPUTING - CLUSTER 2015}, author={Zhang, Wenzhao and Tang, Houjun and Zou, Xiaocheng and Harenberg, Steven and Liu, Qing and Klasky, Scott and Samatova, Nagiza F.}, year={2015}, pages={66–69} }
@article{zou_wu_boyuka_martin_byna_tang_bansal_ligocki_johansen_samatova_2015, title={Parallel In Situ Detection of Connected Components in Adaptive Mesh Refinement Data}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2015.154}, abstractNote={Adaptive Mesh Refinement (AMR) represents a significant advance for scientific simulation codes, greatly reducing memory and compute requirements by dynamically varying simulation resolution over space and time. As simulation codes transition to AMR, existing analysis algorithms must also make this transition. One such algorithm, connected component detection, is of vital importance in many simulation and analysis contexts, with some simulation codes even relying on parallel, in situ connected component detection for correctness. Yet, current detection algorithms designed for uniform meshes are not applicable to hierarchical, non-uniform AMR, and to the best of our knowledge, AMR connected component detection has not been explored in the literature. Therefore, in this paper, we formally define the general problem of connected component detection for AMR, and present a general solution. Beyond solving the general detection problem, achieving viable in situ detection performance is even more challenging. The core issue is the conflict between the communication-intensive nature of connected component detection (in general, and especially for AMR data) and the requirement that in situ processes incur minimal performance impact on the co-located simulation. We address this challenge by presenting the first connected component detection methodology for structured AMR that is applicable in a parallel, in situ context. Our key strategy is the incorporation of a multi-phase AMR-aware communication pattern that synchronizes connectivity information across the AMR hierarchy. In addition, we distill our methodology into a generic framework within the Chombo AMR infrastructure, making connected component detection services available for many existing applications. We demonstrate our method's efficacy by showing its ability to detect ice calving events in real time within the real-world BISICLES ice sheet modelling code. Results show up to a 6.8x speedup of our algorithm over the existing specialized BISICLES algorithm. We also show scalability results for our method up to 4,096 cores using a parallel Chombo-based benchmark.}, journal={2015 15TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING}, author={Zou, Xiaocheng and Wu, Kesheng and Boyuka, David A. and Martin, Daniel F. and Byna, Suren and Tang, Houjun and Bansal, Kushal and Ligocki, Terry J. and Johansen, Hans and Samatova, Nagiza F.}, year={2015}, pages={302–312} }
@article{boyuka_tang_bansal_zou_klasky_samatova_2015, title={The Hyperdyadic Index and Generalized Indexing and Query with PIQUE}, DOI={10.1145/2791347.2791374}, abstractNote={Many scientists rely on indexing and query to identify trends and anomalies within extreme-scale scientific data. Compressed bitmap indexing (e.g., FastBit) is the go-to indexing method for many scientific datasets and query workloads.
Recently, the ALACRITY compressed inverted index was shown as a viable alternative approach. Notably, though FastBit and ALACRITY employ very different data structures (inverted list vs. bitmap) and binning methods (bit-wise vs. decimal-precision), close examination reveals marked similarities in index structure. Motivated by this observation, we ask two questions. First, "Can we generalize FastBit and ALACRITY to an index model encompassing both?" And second, if so, "Can such a generalized framework enable other, new indexing methods?" This paper answers both questions in the affirmative. First, we present PIQUE, a Parallel Indexing and Query Unified Engine, based on formal mathematical decomposition of the indexing process. PIQUE factors out commonalities in indexing, employing algorithmic/data structure "plugins" to mix orthogonal indexing concepts such as FastBit compressed bitmaps with ALACRITY binning, all within one framework. Second, we define the hyperdyadic tree index, distinct from both bitmap and inverted indexes, demonstrating good index compression while maintaining high query performance. We implement the hyperdyadic tree index within PIQUE, reinforcing our unified indexing model. We conduct a performance study of the hyperdyadic tree index vs. WAH compressed bitmaps, both within PIQUE and compared to FastBit, a state-of-the-art bitmap index system. The hyperdyadic tree index shows a 1.14-1.90x storage reduction vs. compressed bitmaps, with comparable or better query performance under most scenarios tested.}, journal={PROCEEDINGS OF THE 27TH INTERNATIONAL CONFERENCE ON SCIENTIFIC AND STATISTICAL DATABASE MANAGEMENT}, author={Boyuka, David A., II and Tang, Houjun and Bansal, Kushal and Zou, Xiaocheng and Klasky, Scott and Samatova, Nagiza F.}, year={2015} }
@inproceedings{schendel_harenberg_tang_vishwanath_papka_samatova_2013, title={A generic high-performance method for deinterleaving scientific data}, volume={8097}, DOI={10.1007/978-3-642-40047-6_58}, abstractNote={High-performance and energy-efficient data management applications are a necessity for HPC systems due to the extreme scale of data produced by high fidelity scientific simulations that these systems support. Data layout in memory hugely impacts performance. For better performance, most simulations interleave variables in memory during their calculation phase, but deinterleave the data for subsequent storage and analysis. As a result, efficient data deinterleaving is critical; yet, common deinterleaving methods provide inefficient throughput and energy performance. To address this problem, we propose a deinterleaving method that is high performance, energy efficient, and generic to any data type. To the best of our knowledge, this is the first deinterleaving method that 1) exploits data cache prefetching, 2) reduces memory accesses, and 3) optimizes the use of complete cache line writes. When evaluated against conventional deinterleaving methods on 105 STREAM standard micro-benchmarks, our method always improved throughput and throughput/watt on multi-core systems. In the best case, our deinterleaving method improved throughput by up to 26.2x and throughput/watt by up to 7.8x.}, booktitle={Euro-par 2013 parallel processing}, author={Schendel, E. R. and Harenberg, S. and Tang, H. J. and Vishwanath, V. and Papka, M. E. and Samatova, N.
F.}, year={2013}, pages={571–582} }
@book{jenkins_zou_tang_kimpe_ross_samatova, title={Parallel data layout optimization of scientific data through access-driven replication}, journal={Technical Report}, author={Jenkins, J. P. and Zou, X. and Tang, H. and Kimpe, D. and Ross, R. and Samatova, N. F.} }