% Bibliography database: seven entries, newest first.
%
% Conventions used in this file:
%   - one field per line, lowercase field names, brace-delimited values;
%   - conference papers use the inproceedings type with the proceedings
%     title in `booktitle`; only true journal papers use `journal`;
%   - author names are stored as `Last, First` or `Last, Jr, First`;
%   - page ranges use `--`; months use the unquoted three-letter macros;
%   - acronyms in titles are brace-protected so styles cannot downcase them;
%   - `abstractNote` is a non-standard field kept from the original export;
%     styles ignore it. Each abstract is kept on a single line.

% Conference paper; the venue was originally stored in `journal`.
@inproceedings{zou_wu_boyuka_martin_byna_tang_bansal_ligocki_johansen_samatova_2015,
  author       = {Zou, Xiaocheng and Wu, Kesheng and Boyuka, David A. and Martin, Daniel F. and Byna, Suren and Tang, Houjun and Bansal, Kushal and Ligocki, Terry J. and Johansen, Hans and Samatova, Nagiza F.},
  title        = {Parallel In Situ Detection of Connected Components in Adaptive Mesh Refinement Data},
  booktitle    = {2015 15th {IEEE/ACM} International Symposium on Cluster, Cloud and Grid Computing},
  year         = {2015},
  pages        = {302--312},
  issn         = {2376-4414},
  doi          = {10.1109/ccgrid.2015.154},
  abstractNote = {Adaptive Mesh Refinement (AMR) represents a significant advance for scientific simulation codes, greatly reducing memory and compute requirements by dynamically varying simulation resolution over space and time. As simulation codes transition to AMR, existing analysis algorithms must also make this transition. One such algorithm, connected component detection, is of vital importance in many simulation and analysis contexts, with some simulation codes even relying on parallel, in situ connected component detection for correctness. Yet, current detection algorithms designed for uniform meshes are not applicable to hierarchical, non-uniform AMR, and to the best of our knowledge, AMR connected component detection has not been explored in the literature. Therefore, in this paper, we formally define the general problem of connected component detection for AMR, and present a general solution. Beyond solving the general detection problem, achieving viable in situ detection performance is even more challenging. The core issue is the conflict between the communication-intensive nature of connected component detection (in general, and especially for AMR data) and the requirement that in situ processes incur minimal performance impact on the co-located simulation. We address this challenge by presenting the first connected component detection methodology for structured AMR that is applicable in a parallel, in situ context. Our key strategy is the incorporation of an multi-phase AMR-aware communication pattern that synchronizes connectivity information across the AMR hierarchy. In addition, we distil our methodology to a generic framework within the Combo AMR infrastructure, making connected component detection services available for many existing applications. We demonstrate our method's efficacy by showing its ability to detect ice calving events in real time within the real-world BISICLES ice sheet modelling code. Results show up to a 6.8x speedup of our algorithm over the existing specialized BISICLES algorithm. We also show scalability results for our method up to 4,096 cores using a parallel Combo-based benchmark.},
}

% Conference paper; the venue was originally stored in `journal`.
% No page range is recorded for this entry.
@inproceedings{boyuka_tang_bansal_zou_klasky_samatova_2015,
  author       = {Boyuka, II, David A. and Tang, Houjun and Bansal, Kushal and Zou, Xiaocheng and Klasky, Scott and Samatova, Nagiza F.},
  title        = {The Hyperdyadic Index and Generalized Indexing and Query with {PIQUE}},
  booktitle    = {Proceedings of the 27th International Conference on Scientific and Statistical Database Management},
  year         = {2015},
  doi          = {10.1145/2791347.2791374},
  abstractNote = {Many scientists rely on indexing and query to identify trends and anomalies within extreme-scale scientific data. Compressed bitmap indexing (e.g., FastBit) is the go-to indexing method for many scientific datasets and query workloads. Recently, the ALACRITY compressed inverted index was shown as a viable alternative approach. Notably, though FastBit and ALACRITY employ very different data structures (inverted list vs. bitmap) and binning methods (bit-wise vs. decimal-precision), close examination reveals marked similarities in index structure. Motivated by this observation, we ask two questions. First, "Can we generalize FastBit and ALACRITY to an index model encompassing both?" And second, if so, "Can such a generalized framework enable other, new indexing methods?" This paper answers both questions in the affirmative. First, we present PIQUE, a Parallel Indexing and Query Unified Engine, based on formal mathematical decomposition of the indexing process. PIQUE factors out commonalities in indexing, employing algorithmic/data structure "plugins" to mix orthogonal indexing concepts such as FastBit compressed bitmaps with ALACRITY binning, all within one framework. Second, we define the hyperdyadic tree index, distinct from both bitmap and inverted indexes, demonstrating good index compression while maintaining high query performance. We implement the hyperdyadic tree index within PIQUE, reinforcing our unified indexing model. We conduct a performance study of the hyperdyadic tree index vs. WAH compressed bitmaps, both within PIQUE and compared to FastBit, a state-of-the-art bitmap index system. The hyperdyadic tree index shows a 1.14-1.90x storage reduction vs. compressed bitmaps, with comparable or better query performance under most scenarios tested.},
}

% Journal article (the only entry in this file with volume and issue data).
@article{lakshminarasimhan_zou_boyuka_pendse_jenkins_vishwanath_papka_klasky_samatova_2014,
  author  = {Lakshminarasimhan, Sriram and Zou, Xiaocheng and Boyuka, II, David A. and Pendse, Saurabh V. and Jenkins, John and Vishwanath, Venkatram and Papka, Michael E. and Klasky, Scott and Samatova, Nagiza F.},
  title   = {{DIRAQ}: scalable in situ data- and resource-aware indexing for optimized query performance},
  journal = {Cluster Computing-The Journal of Networks Software Tools and Applications},
  volume  = {17},
  number  = {4},
  year    = {2014},
  month   = dec,
  pages   = {1101--1119},
  issn    = {1573-7543},
  doi     = {10.1007/s10586-014-0358-z},
}

% Conference paper; the venue was originally stored in `journal`.
@inproceedings{boyuka_lakshminarasimhan_zou_gong_jenkins_schendel_podhorszki_liu_klasky_samatova_2014,
  author       = {Boyuka, II, David A. and Lakshminarasimhan, Sriram and Zou, Xiaocheng and Gong, Zhenhuan and Jenkins, John and Schendel, Eric R. and Podhorszki, Norbert and Liu, Qing and Klasky, Scott and Samatova, Nagiza F.},
  title        = {Transparent In Situ Data Transformations in {ADIOS}},
  booktitle    = {2014 14th {IEEE/ACM} International Symposium on Cluster, Cloud and Grid Computing ({CCGRID})},
  year         = {2014},
  pages        = {256--266},
  issn         = {2376-4414},
  doi          = {10.1109/ccgrid.2014.73},
  abstractNote = {Though an abundance of novel "data transformation" technologies have been developed (such as compression, level-of-detail, layout optimization, and indexing), there remains a notable gap in the adoption of such services by scientific applications. In response, we develop an in situ data transformation framework in the ADIOS I/O middleware with a "plug in" interface, thus greatly simplifying both the deployment and use of data transform services in scientific applications. Our approach ensures user-transparency, runtime-configurability, compatibility with existing I/O optimizations, and the potential for exploiting read-optimizing transforms (such as level-of-detail) to achieve I/O reduction. We demonstrate use of our framework with the QLG simulation at up to 8,192 cores on the leadership-class Titan supercomputer, showing negligible overhead. We also explore the read performance implications of data transforms with respect to parameters such as chunk size, access pattern, and the "opacity" of different transform methods including compression and level-of-detail.},
}

% Conference paper; the venue was originally stored in `journal`.
% `{PArallel}` is braced to preserve the title's deliberate capitalisation.
@inproceedings{gong_boyuka_zou_liu_podhorszki_klasky_ma_samatova_2013,
  author       = {Gong, Zhenhuan and Boyuka, II, David A. and Zou, Xiaocheng and Liu, Qing and Podhorszki, Norbert and Klasky, Scott and Ma, Xiaosong and Samatova, Nagiza F.},
  title        = {{PARLO}: {PArallel} Run-time Layout Optimization for Scientific Data Explorations with Heterogeneous Access Patterns},
  booktitle    = {Proceedings of the 2013 13th {IEEE/ACM} International Symposium on Cluster, Cloud and Grid Computing ({CCGRID} 2013)},
  year         = {2013},
  pages        = {343--351},
  isbn         = {978-1-4673-6465-2},
  issn         = {2376-4414},
  doi          = {10.1109/ccgrid.2013.58},
  abstractNote = {The size and scope of cutting-edge scientific simulations are growing much faster than the I/O and storage capabilities of their run-time environments. The growing gap is exacerbated by exploratory, data-intensive analytics, such as querying simulation data with multivariate, spatio-temporal constraints, which induces heterogeneous access patterns that stress the performance of the underlying storage system. Previous work addresses data layout and indexing techniques to improve query performance for a single access pattern, which is not sufficient for complex analytics jobs. We present PARLO a parallel run-time layout optimization framework, to achieve multi-level data layout optimization for scientific applications at run-time before data is written to storage. The layout schemes optimize for heterogeneous access patterns with user-specified priorities. PARLO is integrated with ADIOS, a high-performance parallel I/O middleware for large-scale HPC applications, to achieve user-transparent, light-weight layout optimization for scientific datasets. It offers simple XML-based configuration for users to achieve flexible layout optimization without the need to modify or recompile application codes. Experiments show that PARLO improves performance by 2 to 26 times for queries with heterogeneous access patterns compared to state-of-the-art scientific database management systems. Compared to traditional post-processing approaches, its underlying run-time layout optimization achieves a 56% savings in processing time and a reduction in storage overhead of up to 50%. PARLO also exhibits a low run-time resource requirement, while also limiting the performance impact on running applications to a reasonable level.},
}

% Conference paper. Author given names are stored as initials only in the
% source data; no page range is recorded.
@inproceedings{jenkins_schendel_lakshminarasimhan_boyuka_rogers_ethier_ross_klasky_samatova_2012,
  author       = {Jenkins, J. and Schendel, E. R. and Lakshminarasimhan, S. and Boyuka, D. A. and Rogers, T. and Ethier, S. and Ross, R. and Klasky, S. and Samatova, N. F.},
  title        = {Byte-precision level of detail processing for variable precision analytics},
  booktitle    = {International conference for high performance computing networking},
  year         = {2012},
  doi          = {10.1109/sc.2012.26},
  abstractNote = {I/O bottlenecks in HPC applications are becoming a more pressing problem as compute capabilities continue to outpace I/O capabilities. While double-precision simulation data often must be stored losslessly, the loss of some of the fractional component may introduce acceptably small errors to many types of scientific analyses. Given this observation, we develop a precision level of detail (APLOD) library, which partitions double-precision datasets along user-defined byte boundaries. APLOD parameterizes the analysis accuracy-I/O performance tradeoff, bounds maximum relative error, maintains I/O access patterns compared to full precision, and operates with low overhead. Using ADIOS as an I/O use-case, we show proportional reduction in disk access time to the degree of precision. Finally, we show the effects of partial precision analysis on accuracy for operations such as k-means and Fourier analysis, finding a strong applicability for the use of varying degrees of precision to reduce the cost of analyzing extreme-scale data.},
}

% Paper in a collected volume.
% Key change: the exported key ended in `et al.` with an embedded space,
% which is not a valid citation key; it now ends in `_et_al`. Update any
% existing citations of this entry accordingly.
% The truncated author list is expressed with the `and others` token.
% NOTE(review): no year is recorded for this entry, and it is a required
% field for this entry type -- confirm against the publisher record and add it.
@inproceedings{jenkins_arkatkar_lakshminarasimhan_boyuka_schendel_shah_ethier_chang_chen_kolla_et_al,
  author    = {Jenkins, J. and Arkatkar, I. and Lakshminarasimhan, S. and Boyuka, D. A. and Schendel, E. R. and Shah, N. and Ethier, S. and Chang, C. S. and Chen, J. and Kolla, H. and others},
  title     = {{ALACRITY}: Analytics-driven lossless data compression for rapid in-situ indexing, storing, and querying},
  booktitle = {Transactions on large-scale data- and knowledge-centered systems x: special issue on database- and expert-systems applications},
  volume    = {8220},
  pages     = {95--114},
}