% Bibliography database, cleaned up from an auto-export:
% - removed JSON-array residue in ISBN/ISSN fields, fixed page-range dashes (--),
%   month macros, name forms, venue casing, and brace-protected acronyms in titles.
% - NOTE(review): key below was `..._et al._2016` (contained a space and a period,
%   which classic BibTeX rejects); renamed to `..._et_al_2016` -- update any \cite uses.

@string{tpds = {{IEEE} Transactions on Parallel and Distributed Systems}}

@article{aji_panwar_ji_murthy_chabbi_balaji_bisset_dinan_feng_mellor-crummey_et_al_2016,
  title        = {{MPI-ACC}: accelerator-aware {MPI} for scientific applications},
  volume       = {27},
  number       = {5},
  journal      = tpds,
  author       = {Aji, A. M. and Panwar, L. S. and Ji, F. and Murthy, K. and Chabbi, M. and Balaji, P. and Bisset, K. R. and Dinan, J. and Feng, W. C. and Mellor-Crummey, J. and others},
  year         = {2016},
  pages        = {1401--1414},
}

% Conference paper: was @article with the proceedings title in `journal`;
% moved to @inproceedings/booktitle.
@inproceedings{gong_boyuka_zou_liu_podhorszki_klasky_ma_samatova_2013,
  title        = {{PARLO}: {PArallel} Run-time Layout Optimization for Scientific Data Explorations with Heterogeneous Access Patterns},
  ISBN         = {978-1-4673-6465-2},
  ISSN         = {2376-4414},
  DOI          = {10.1109/ccgrid.2013.58},
  abstractNote = {The size and scope of cutting-edge scientific simulations are growing much faster than the I/O and storage capabilities of their run-time environments. The growing gap is exacerbated by exploratory, data-intensive analytics, such as querying simulation data with multivariate, spatio-temporal constraints, which induces heterogeneous access patterns that stress the performance of the underlying storage system. Previous work addresses data layout and indexing techniques to improve query performance for a single access pattern, which is not sufficient for complex analytics jobs. We present PARLO a parallel run-time layout optimization framework, to achieve multi-level data layout optimization for scientific applications at run-time before data is written to storage. The layout schemes optimize for heterogeneous access patterns with user-specified priorities. PARLO is integrated with ADIOS, a high-performance parallel I/O middleware for large-scale HPC applications, to achieve user-transparent, light-weight layout optimization for scientific datasets. It offers simple XML-based configuration for users to achieve flexible layout optimization without the need to modify or recompile application codes. Experiments show that PARLO improves performance by 2 to 26 times for queries with heterogeneous access patterns compared to state-of-the-art scientific database management systems. Compared to traditional post-processing approaches, its underlying run-time layout optimization achieves a 56\% savings in processing time and a reduction in storage overhead of up to 50\%. PARLO also exhibits a low run-time resource requirement, while also limiting the performance impact on running applications to a reasonable level.},
  booktitle    = {Proceedings of the 2013 13th {IEEE/ACM} International Symposium on Cluster, Cloud and Grid Computing ({CCGRID} 2013)},
  author       = {Gong, Zhenhuan and Boyuka, II, David A. and Zou, Xiaocheng and Liu, Qing and Podhorszki, Norbert and Klasky, Scott and Ma, Xiaosong and Samatova, Nagiza F.},
  year         = {2013},
  pages        = {343--351},
}

@inproceedings{ji_lin_ma_2013,
  title        = {{RSVM}: A region-based software virtual memory for {GPU}},
  booktitle    = {International Conference on Parallel Architectures and Compilation},
  author       = {Ji, F. and Lin, H. S. and Ma, X. S.},
  year         = {2013},
  pages        = {269--278},
}

@inproceedings{ji_aji_dinan_buntinas_balaji_feng_ma_2012,
  title        = {Efficient Intranode communication in {GPU}-accelerated systems},
  booktitle    = {{IEEE} International Symposium on Parallel and Distributed Processing},
  author       = {Ji, F. and Aji, A. M. and Dinan, J. and Buntinas, D. and Balaji, P. and Feng, W. C. and Ma, X. S.},
  year         = {2012},
  pages        = {1838--1847},
}

@article{lin_ma_feng_2012,
  title        = {Reliable {MapReduce} computing on opportunistic resources},
  volume       = {15},
  number       = {2},
  journal      = {Cluster Computing: The Journal of Networks, Software Tools and Applications},
  author       = {Lin, H. S. and Ma, X. S. and Feng, W. C.},
  year         = {2012},
  pages        = {145--161},
}

@article{lin_ma_feng_samatova_2011,
  title        = {Coordinating Computation and {I/O} in Massively Parallel Sequence Search},
  volume       = {22},
  ISSN         = {1558-2183},
  DOI          = {10.1109/tpds.2010.101},
  abstractNote = {With the explosive growth of genomic information, the searching of sequence databases has emerged as one of the most computation and data-intensive scientific applications. Our previous studies suggested that parallel genomic sequence-search possesses highly irregular computation and I/O patterns. Effectively addressing these runtime irregularities is thus the key to designing scalable sequence-search tools on massively parallel computers. While the computation scheduling for irregular scientific applications and the optimization of noncontiguous file accesses have been well-studied independently, little attention has been paid to the interplay between the two. In this paper, we systematically investigate the computation and I/O scheduling for data-intensive, irregular scientific applications within the context of genomic sequence search. Our study reveals that the lack of coordination between computation scheduling and I/O optimization could result in severe performance issues. We then propose an integrated scheduling approach that effectively improves sequence-search throughput by gracefully coordinating the dynamic load balancing of computation and high-performance noncontiguous I/O.},
  number       = {4},
  journal      = tpds,
  author       = {Lin, Heshan and Ma, Xiaosong and Feng, Wuchun and Samatova, Nagiza F.},
  year         = {2011},
  month        = apr,
  pages        = {529--543},
}

@article{li_ma_yoginath_kora_samatova_2011,
  title        = {Transparent runtime parallelization of the {R} scripting language},
  volume       = {71},
  ISSN         = {1096-0848},
  DOI          = {10.1016/j.jpdc.2010.08.013},
  abstractNote = {Scripting languages such as R and Matlab are widely used in scientific data processing. As the data volume and the complexity of analysis tasks both grow, sequential data processing using these tools often becomes the bottleneck in scientific workflows. We describe pR, a runtime framework for automatic and transparent parallelization of the popular R language used in statistical computing. Recognizing scripting languages' interpreted nature and data analysis codes' use pattern, we propose several novel techniques: (1) applying parallelizing compiler technology to runtime, whole-program dependence analysis of scripting languages, (2) incremental code analysis assisted with evaluation results, and (3) runtime parallelization of file accesses. Our framework does not require any modification to either the source code or the underlying R implementation. Experimental results demonstrate that pR can exploit both task and data parallelism transparently and overall has better performance as well as scalability compared to an existing parallel R package that requires code modification.},
  number       = {2},
  journal      = {Journal of Parallel and Distributed Computing},
  author       = {Li, Jiangtian and Ma, Xiaosong and Yoginath, Srikanth and Kora, Guruprasad and Samatova, Nagiza F.},
  year         = {2011},
  month        = feb,
  pages        = {157--168},
}

@article{ma_vazhkudai_zhang_2009,
  title        = {Improving Data Availability for Better Access Performance: A Study on Caching Scientific Data on Distributed Desktop Workstations},
  volume       = {7},
  ISSN         = {1572-9184},
  DOI          = {10.1007/s10723-009-9122-7},
  abstractNote = {Client-side data caching serves as an excellent mechanism to store and analyze the rapidly growing scientific data, motivating distributed, client-side caches built from unreliable desktop storage contributions to store and access large scientific data. They offer several desirable properties, such as performance impedance matching, improved space utilization, and high parallel I/O bandwidth. In this context, we are faced with two key challenges: (1) the finite amount of contributed cache space is stretched by the ever increasing scientific dataset sizes and (2) the transient nature of volunteered storage nodes impacts data availability. In this article, we address these challenges by exploiting the existence of external, primary copies of datasets. We propose a novel combination of prefix caching, collective download, and remote partial data recovery (RPDR), to deal with optimal cache space consumption and storage node volatility. Our evaluation, performed on our FreeLoader prototype, indicates that prefix caching can significantly improve the cache hit rate and partial data recovery is better than (or comparable to) many persistent-data availability techniques.},
  number       = {4},
  journal      = {Journal of Grid Computing},
  author       = {Ma, Xiaosong and Vazhkudai, Sudharshan S. and Zhang, Zhe},
  year         = {2009},
  month        = dec,
  pages        = {419--438},
}

@inproceedings{gong_ramaswamy_gu_ma_2009,
  title        = {{SigLM}: signature-driven load management for cloud computing infrastructures},
  booktitle    = {{IWQoS}: 2009 {IEEE} 17th International Workshop on Quality of Service},
  author       = {Gong, Z. H. and Ramaswamy, P. and Gu, X. H. and Ma, X. S.},
  year         = {2009},
  pages        = {226--234},
}

@inproceedings{lin_balaji_poole_sosa_ma_feng_2008,
  title        = {Massively parallel genomic sequence search on the {Blue Gene/P} architecture},
  booktitle    = {International Conference for High Performance Computing, Networking, Storage and Analysis},
  author       = {Lin, H. S. and Balaji, P. and Poole, R. and Sosa, C. and Ma, X. S. and Feng, W. C.},
  year         = {2008},
  pages        = {522--532},
}

@article{ma_lee_winslett_2006,
  title        = {High-level buffering for hiding periodic output cost in scientific simulations},
  volume       = {17},
  ISSN         = {1558-2183},
  DOI          = {10.1109/TPDS.2006.36},
  abstractNote = {Scientific applications often need to write out large arrays and associated metadata periodically for visualization or restart purposes. In this paper, we present active buffering, a high-level transparent buffering scheme for collective I/O, in which processors actively organize their idle memory into a hierarchy of buffers for periodic output data. It utilizes idle memory on the processors, yet makes no assumption regarding runtime memory availability. Active buffering can perform background I/O while the computation is going on, is extensible to remote I/O for more efficient data migration, and can be implemented in a portable style in today's parallel I/O libraries. It can also mask performance problems of scientific data formats used by many scientists. Performance experiments with both synthetic benchmarks and real simulation codes on multiple platforms show that active buffering can greatly reduce the visible I/O cost from the application's point of view.},
  number       = {3},
  journal      = tpds,
  author       = {Ma, X. S. and Lee, J. and Winslett, M.},
  year         = {2006},
  month        = mar,
  pages        = {193--204},
}