@article{zhao_yang_bolnykh_harenberg_korchiev_yerramsetty_vellanki_kodumagulla_samatova_2021, title={Predictive models with end user preference}, volume={8}, ISSN={["1932-1872"]}, DOI={10.1002/sam.11545}, abstractNote={AbstractClassical machine learning models typically try to optimize the model based on the most discriminatory features of the data; however, they do not usually account for end user preferences. In certain applications, this can be a serious issue as models not aware of user preferences could become costly, untrustworthy, or privacy‐intrusive to use, thus becoming irrelevant and/or uninterpretable. Ideally, end users with domain knowledge could propose preferable features that the predictive model could then take into account. In this paper, we propose a generic modeling method that respects end user preferences via a relative ranking system to express multi‐criteria preferences and a regularization term in the model's objective function to incorporate the ranked preferences. In a more generic perspective, this method is able to plug user preferences into existing predictive models without creating completely new ones. 
We implement this method in the context of decision trees and are able to achieve a comparable classification accuracy while reducing the use of undesirable features.}, journal={STATISTICAL ANALYSIS AND DATA MINING}, author={Zhao, Yifan and Yang, Xian and Bolnykh, Carolina and Harenberg, Steve and Korchiev, Nodirbek and Yerramsetty, Saavan Raj and Vellanki, Bhanu Prasad and Kodumagulla, Ramakanth and Samatova, Nagiza F.}, year={2021}, month={Aug} } @article{meleshko_samatova_2020, title={Group classification of the two-dimensional shallow water equations with the beta-plane approximation of coriolis parameter in Lagrangian coordinates}, volume={90}, ISSN={["1878-7274"]}, DOI={10.1016/j.cnsns.2020.105337}, abstractNote={Two-dimensional shallow water equations with uneven bottom and a Coriolis parameter f=f0+βy, (β ≠ 0) in mass Lagrangian coordinates are studied in this paper. The equations describing these flows are reduced to two Euler–Lagrange equations. The paper provides a complete group classification of the equations and applications of Noether's theorem for constructing conservation laws.}, journal={COMMUNICATIONS IN NONLINEAR SCIENCE AND NUMERICAL SIMULATION}, author={Meleshko, S. and Samatova, N. F.}, year={2020}, month={Nov} } @article{kaptsov_meleshko_samatova_2020, title={The one-dimensional Green-Naghdi equations with a time dependent bottom topography and their conservation laws}, volume={32}, ISSN={["1089-7666"]}, DOI={10.1063/5.0031238}, abstractNote={This paper deals with the one-dimensional Green–Naghdi equations describing the behavior of fluid flow over an uneven bottom topography depending on time. Using Matsuno’s approach, the corresponding equations are derived in Eulerian coordinates. Further study is performed in Lagrangian coordinates. This study allowed us to find the general form of the Lagrangian corresponding to the analyzed equations. Then, Noether’s theorem is used to derive conservation laws. 
As some of the tools in the application of Noether’s theorem are admitted generators, a complete group classification of the Green–Naghdi equations with respect to the bottom depending on time is performed. Using Noether’s theorem, the found Lagrangians, and the group classification, conservation laws of the one-dimensional Green–Naghdi equations with uneven bottom topography depending on time are obtained.}, number={12}, journal={PHYSICS OF FLUIDS}, author={Kaptsov, E. and Meleshko, S. and Samatova, N. F.}, year={2020}, month={Dec} } @article{yang_he_xu_ni_jones_samatova_2018, title={An Intelligent and Hybrid Weighted Fuzzy Time Series Model Based on Empirical Mode Decomposition for Financial Markets Forecasting}, volume={10933}, ISBN={["978-3-319-95785-2"]}, ISSN={["1611-3349"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85049883593&partnerID=MN8TOARS}, DOI={10.1007/978-3-319-95786-9_8}, abstractNote={Given the potentially high impact of accurate financial market forecasting, there has been considerable research on time series analysis for financial markets. We present a new Intelligent Hybrid Weighted Fuzzy (IHWF) time series model to improve forecasting accuracy in financial markets, which are complex nonlinear time-sensitive systems, influenced by many factors. The IHWF model uniquely combines Empirical Mode Decomposition (EMD) with a novel weighted fuzzy time series method. The model is enhanced by an Adaptive Sine-Cosine Human Learning Optimization (ASCHLO) algorithm to help find optimal parameters that further improve forecasting performance. EMD is a time series processing technique to extract the possible modes of various kinds of institutional and individual investors and traders, embedded in a given time series. 
Subsequently, the proposed weighted fuzzy time series method with chronological order based frequency and Neighborhood Volatility Direction (NVD) is analyzed and integrated with ASCHLO to determine the effective universe discourse, intervals and weights. In order to evaluate the performance of proposed model, we evaluate actual trading data of Taiwan Capitalization Weighted Stock Index (TAIEX) from 1990 to 2004 and the findings are compared with other well-known forecasting models. The results show that the proposed method outperforms the listing models in terms of accuracy.}, journal={ADVANCES IN DATA MINING: APPLICATIONS AND THEORETICAL ASPECTS (ICDM 2018)}, author={Yang, Ruixin and He, Junyi and Xu, Mingyang and Ni, Haoqi and Jones, Paul and Samatova, Nagiza}, year={2018}, pages={104–118} } @article{xu_yang_jones_samatova_2018, title={Mining Aspect-Specific Opinions from Online Reviews Using a Latent Embedding Structured Topic Model}, volume={10762}, ISSN={["1611-3349"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85055688176&partnerID=MN8TOARS}, DOI={10.1007/978-3-319-77116-8_15}, abstractNote={Online reviews often contain user’s specific opinions on aspects (features) of items. These opinions are very useful to merchants and customers, but manually extracting them is time-consuming. Several topic models have been proposed to simultaneously extract item aspects and user’s opinions on the aspects, as well as to detect sentiment associated with the opinions. However, existing models tend to find poor aspect-opinion associations when limited examples of the required word co-occurrences are available in corpus. These models often also assign incorrect sentiment to words. In this paper, we propose a Latent embedding structured Opinion mining Topic model, called the LOT, which can simultaneously discover relevant aspect-level specific opinions from small or large numbers of reviews and to assign accurate sentiment to words. 
Experimental results for topic coherence, document sentiment classification, and a human evaluation all show that our proposed model achieves significant improvements over several state-of-the-art baselines.}, journal={COMPUTATIONAL LINGUISTICS AND INTELLIGENT TEXT PROCESSING, CICLING 2017, PT II}, author={Xu, Mingyang and Yang, Ruixin and Jones, Paul and Samatova, Nagiza F.}, year={2018}, pages={195–210} } @article{sohn_shpanskaya_lucas_petrella_saykin_tanzi_samatova_doraiswamy_2018, title={Sex Differences in Cognitive Decline in Subjects with High Likelihood of Mild Cognitive Impairment due to Alzheimer's disease}, volume={8}, ISSN={["2045-2322"]}, DOI={10.1038/s41598-018-25377-w}, abstractNote={AbstractSex differences in Alzheimer’s disease (AD) biology and progression are not yet fully characterized. The goal of this study is to examine the effect of sex on cognitive progression in subjects with high likelihood of mild cognitive impairment (MCI) due to Alzheimer’s and followed up to 10 years in the Alzheimer’s Disease Neuroimaging Initiative (ADNI). Cerebrospinal fluid total-tau and amyloid-beta (Aβ42) ratio values were used to sub-classify 559 MCI subjects (216 females, 343 males) as having “high” or “low” likelihood for MCI due to Alzheimer’s. Data were analyzed using mixed-effects models incorporating all follow-ups. The worsening from baseline in Alzheimer’s Disease Assessment Scale-Cognitive score (mean, SD) (9 ± 12) in subjects with high likelihood of MCI due to Alzheimer’s was markedly greater than that in subjects with low likelihood (1 ± 6, p < 0.0001). Among MCI due to AD subjects, the mean worsening in cognitive score was significantly greater in females (11.58 ± 14) than in males (6.87 ± 11, p = 0.006). 
Our findings highlight the need to further investigate these findings in other populations and develop sex specific timelines for Alzheimer’s disease progression.}, journal={SCIENTIFIC REPORTS}, author={Sohn, Dongwha and Shpanskaya, Katie and Lucas, Joseph E. and Petrella, Jeffrey R. and Saykin, Andrew J. and Tanzi, Rudolph E. and Samatova, Nagiza F. and Doraiswamy, P. Murali}, year={2018}, month={May} } @inbook{chaudhary_ranshous_samatova_2017, place={Cham, Switzerland}, series={Studies in Computational Intelligence}, title={A Community-Driven Graph Partitioning Method for Constraint-Based Causal Discovery}, ISBN={9783319721491 9783319721507}, ISSN={1860-949X 1860-9503}, url={http://dx.doi.org/10.1007/978-3-319-72150-7_21}, DOI={10.1007/978-3-319-72150-7_21}, abstractNote={Constraint-based (CB) methods are widely used for discovering causal relationships in observational data. The PC-stable algorithm is a prominent example of CB methods. A critical component of the PC-stable algorithm is to find d-separators and perform conditional independence (CI) tests to eliminate spurious causal relationships. While the pairwise CI tests are necessary for identifying causal relationships, the error rate, where true causal relationships are erroneously removed, increases with the number of tests performed. Efficiently searching for the true d-separator set is thus a critical component to increase the accuracy of the causal graph. To this end, we propose a novel recursive algorithm for constructing causal graphs, based on a two-phase divide and conquer strategy. In phase one, we recursively partition the undirected graph using community detection, and subsequently construct partial skeletons from each partition. Phase two uses a bottom-up approach to merge the subgraph skeletons, ultimately yielding the full causal graph. 
Simulations on several real-world data sets show that our approach effectively finds the d-separators, leading to a significant improvement in the quality of causal graphs.}, booktitle={Complex Networks \& Their Applications VI. COMPLEX NETWORKS 2017}, publisher={Springer International Publishing}, author={Chaudhary, Mandar S. and Ranshous, Stephen and Samatova, Nagiza F.}, editor={Cherifi, C. and Cherifi, H. and Karsai, M. and Musolesi, M.}, year={2017}, month={Nov}, pages={253–264}, collection={Studies in Computational Intelligence} } @article{xu_yang_harenberg_samatova_2017, title={A Lifelong Learning Topic Model Structured Using Latent Embeddings}, ISSN={["2325-6516"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85018319146&partnerID=MN8TOARS}, DOI={10.1109/icsc.2017.15}, abstractNote={We propose a latent-embedding-structured lifelong learning topic model, called the LLT model, to discover coherent topics from a corpus. Specifically, we exploit latent word embeddings to structure our model and mine word correlation knowledge to assist in topic modeling. During each learning iteration, our model learns new word embeddings based on the topics generated in the previous learning iteration. 
Experimental results demonstrate that our LLT model is able to generate more coherent topics than state-of-the-art methods.}, journal={2017 11TH IEEE INTERNATIONAL CONFERENCE ON SEMANTIC COMPUTING (ICSC)}, author={Xu, Mingyang and Yang, Ruixin and Harenberg, Steve and Samatova, Nagiza F.}, year={2017}, pages={260–261} } @article{yang_xu_he_ranshous_samatova_2017, title={An Intelligent Weighted Fuzzy Time Series Model Based on a Sine-Cosine Adaptive Human Learning Optimization Algorithm and Its Application to Financial Markets Forecasting}, volume={10604}, ISBN={["978-3-319-69178-7"]}, ISSN={["1611-3349"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85033718976&partnerID=MN8TOARS}, DOI={10.1007/978-3-319-69179-4_42}, abstractNote={Financial forecasting is an extremely challenging task given the complex, nonlinear nature of financial market systems. To overcome this challenge, we present an intelligent weighted fuzzy time series model for financial forecasting, which uses a sine-cosine adaptive human learning optimization (SCHLO) algorithm to search for the optimal parameters for forecasting. New weighted operators that consider frequency based chronological order and stock volume are analyzed, and SCHLO is integrated to determine the effective intervals and weighting factors. Furthermore, a novel short-term trend repair operation is developed to complement the final forecasting process. Finally, the proposed model is applied to four world major trading markets: the Dow Jones Index (DJI), the German Stock Index (DAX), the Japanese Stock Index (NIKKEI), and Taiwan Stock Index (TAIEX). Experimental results show that our model is consistently more accurate than the state-of-the-art baseline methods. 
The easy implementation and effective forecasting performance suggest our proposed model could be a favorable market application prospect.}, journal={ADVANCED DATA MINING AND APPLICATIONS, ADMA 2017}, author={Yang, Ruixin and Xu, Mingyang and He, Junyi and Ranshous, Stephen and Samatova, Nagiza F.}, year={2017}, pages={595–607} } @inbook{ranshous_chaudhary_samatova_2017, place={Cham, Switzerland}, series={Studies in Computational Intelligence}, title={Efficient Outlier Detection in Hyperedge Streams Using MinHash and Locality-Sensitive Hashing}, ISBN={9783319721491 9783319721507}, ISSN={1860-949X 1860-9503}, url={http://dx.doi.org/10.1007/978-3-319-72150-7_9}, DOI={10.1007/978-3-319-72150-7_9}, abstractNote={Mining outliers in graph data is a rapidly growing area of research. Traditional methods focus either on static graphs, or restrict relationships to be pairwise. In this work we address both of these limitations directly, and propose the first approach for mining outliers in hyperedge streams. Hyperedges, which generalize edges, faithfully capture higher order relationships that naturally occur in complex systems. Our model annotates every incoming hyperedge with an outlier score, which is based on the incident vertices and the historical relationships among them. Additionally, we describe an approximation scheme that ensures our model is suitable for being run in streaming environments. Experimental results on several real-world datasets show our model effectively identifies outliers, and that our approximation provides speedups between 33–775x.}, booktitle={Complex Networks \& Their Applications VI. COMPLEX NETWORKS 2017}, publisher={Springer International Publishing}, author={Ranshous, Stephen and Chaudhary, Mandar and Samatova, Nagiza F.}, editor={Cherifi, C. and Cherifi, H. and Karsai, M. 
and Musolesi, M.}, year={2017}, month={Nov}, pages={105–116}, collection={Studies in Computational Intelligence} } @inbook{ranshous_joslyn_kreyling_nowak_samatova_west_winters_2017, title={Exchange Pattern Mining in the Bitcoin Transaction Directed Hypergraph}, ISBN={9783319702773 9783319702780}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-70278-0_16}, DOI={10.1007/978-3-319-70278-0_16}, abstractNote={Bitcoin exchanges operate between digital and fiat currency networks, thus providing an opportunity to connect real-world identities to pseudonymous addresses, an important task for anti-money laundering efforts. We seek to characterize, understand, and identify patterns centered around exchanges in the context of a directed hypergraph model for Bitcoin transactions. We introduce the idea of motifs in directed hypergraphs, considering a particular 2-motif as a potential laundering pattern. We identify distinct statistical properties of exchange addresses related to the acquisition and spending of bitcoin. We then leverage this to build classification models to learn a set of discriminating features, and are able to predict if an address is owned by an exchange with $$>80\%$$ accuracy using purely structural features of the graph. Applying this classifier to the 2-motif patterns reveals a preponderance of inter-exchange activity, while not necessarily significant laundering patterns.}, booktitle={Financial Cryptography and Data Security}, publisher={Springer International Publishing}, author={Ranshous, Stephen and Joslyn, Cliff A. and Kreyling, Sean and Nowak, Kathleen and Samatova, Nagiza F. and West, Curtis L. 
and Winters, Samuel}, year={2017}, pages={248–263} } @inproceedings{xu_yang_ranshous_li_samatova_2018, title={Leveraging External Knowledge for Phrase-Based Topic Modeling}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85048362739&partnerID=MN8TOARS}, DOI={10.1109/taai.2017.25}, abstractNote={Topic modeling has been widely used for extracting the major topics from a corpus. Each discovered topic contains a set of related individual words that describe the topic itself. The discovered topics summarize the major themes of the corpus. Recently, a few phrase-based topic models have been proposed, which simultaneously model phrases and topics. The topics discovered by these models consist of phrases besides individual words, as phrases are typically more meaningful. However, these models typically require large amounts of data to provide reliable statistics for phrase-based topic modeling, thus limiting their performance in scenarios with limited data. To address this limitation, we propose a knowledge-based topic model that incorporates two types of pre-identified external knowledge for topical phrase discovery: Phrase knowledge, and phrase correlation knowledge. Phrase knowledge guides the discovery of meaningful phrases by leveraging a set of pre-identified exemplary phrases; Phrase correlation knowledge guides the discovery of meaningful topics by exploiting a set of pre-identified pairs of related phrases. Experimental results show that our method outperforms the state-of-the-art baseline on both small and large datasets, extracting more meaningful phrases and coherent topics.}, booktitle={Proceedings - 2017 Conference on Technologies and Applications of Artificial Intelligence, TAAI 2017}, author={Xu, M. and Yang, R. and Ranshous, S. and Li, S. 
and Samatova, N.F.}, year={2018}, pages={29–32} } @inproceedings{yang_xu_jones_samatova_2018, title={Real time utility-based recommendation for revenue optimization via an adaptive online Top-K high utility itemsets mining model}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85050186014&partnerID=MN8TOARS}, DOI={10.1109/fskd.2017.8393050}, abstractNote={Recommender Systems (RS) in e-commerce are typically used to suggest products to online shopping customers, and now play a key role in product marketing strategies for major online retailers, such as Walmart and Amazon. The main goal of such systems is to predict likely future customer desires and to trigger purchases through the timely provision of product recommendations. Therefore, RS have become indispensable tools for both customers and retailers. However, most existing RS recommend products from the point view of customers (i.e. likelihood of customer purchase) but ignore one of the most important business goals: the optimization of revenue. Consequently, there is an increasing need to learn utility patterns online and provide near real-time utility-based recommendations. To address these challenges, we first define the utility of recommendation sets and formulate the problem of real time utility-based recommendation. Next, we consider that online transaction streams are usually accompanied with flow fluctuation, and propose an Adaptive Online Top-K (RAOTK) high utility itemsets mining model to guide the utility-based recommendations. Additionally, three variants of this algorithm are described and we provide a structural comparison of the four algorithms with discussions on their advantages and limitations. Moreover, to make our model more personalized, we also take the buying power of customers into account and propose a simple but effective method to estimate the consumers' willingness to pay. 
Finally, extensive empirical results on real-world datasets show that the proposed model works effectively and outperforms several baselines.}, booktitle={ICNC-FSKD 2017 - 13th International Conference on Natural Computation, Fuzzy Systems and Knowledge Discovery}, author={Yang, R. and Xu, M. and Jones, P. and Samatova, N.}, year={2018}, pages={1859–1866} } @article{karpatne_atluri_faghmous_steinbach_banerjee_ganguly_shekhar_samatova_kumar_2017, title={Theory-Guided Data Science: A New Paradigm for Scientific Discovery from Data}, volume={29}, ISSN={["1558-2191"]}, DOI={10.1109/tkde.2017.2720168}, abstractNote={Data science models, although successful in a number of commercial domains, have had limited applicability in scientific problems involving complex physical phenomena. Theory-guided data science (TGDS) is an emerging paradigm that aims to leverage the wealth of scientific knowledge for improving the effectiveness of data science models in enabling scientific discovery. The overarching vision of TGDS is to introduce scientific consistency as an essential component for learning generalizable models. Further, by producing scientifically interpretable models, TGDS aims to advance our scientific understanding by discovering novel domain insights. Indeed, the paradigm of TGDS has started to gain prominence in a number of scientific disciplines such as turbulence modeling, material discovery, quantum chemistry, bio-medical science, bio-marker discovery, climate science, and hydrology. In this paper, we formally conceptualize the paradigm of TGDS and present a taxonomy of research themes in TGDS. We describe several approaches for integrating domain knowledge in different research themes using illustrative examples from different disciplines. 
We also highlight some of the promising avenues of novel research for realizing the full potential of theory-guided data science.}, number={10}, journal={IEEE TRANSACTIONS ON KNOWLEDGE AND DATA ENGINEERING}, author={Karpatne, Anuj and Atluri, Gowtham and Faghmous, James H. and Steinbach, Michael and Banerjee, Arindam and Ganguly, Auroop and Shekhar, Shashi and Samatova, Nagiza and Kumar, Vipin}, year={2017}, month={Oct}, pages={2318–2331} } @article{jones_medd_ramakrishnan_shah_keyton_samatova_2017, title={Towards Automatic Linkage of Knowledge Worker's Claims with Associated Evidence from Screenshots}, DOI={10.1109/bigdataservice.2017.21}, abstractNote={Knowledge workers are frequently subject to information overload. As a result, when looking to make analytic judgements, they may only have time to search for evidence that already matches their existing viewpoint, leading to confirmation bias. New computer systems are needed that can help users overcome this and other cognitive biases. As an enabling step towards such systems, the research community has developed instrumentation software that captures data to help better understand sensemaking processes and workflows. However, existing instrumentation approaches are limited by the need to write operating system-specific (and often application-specific) code to 'see' what the user is doing inside different applications on their computer. This source code quickly becomes complex and brittle. Furthermore, this approach does not provide a holistic view of how the user is gleaning information from multiple applications at once. We propose an alternative approach to instrumentation based on automated analysis of desktop screenshots, and demonstrate this in the context of extraction of 'claims' from reports that users are writing, and association of these claims with 'evidence' obtained from web browsing. 
We evaluate our approach on a corpus of 121,000 screenshots obtained from a study of 150 participants carrying out a controlled analysis task. The topic of the task was previously unfamiliar to them (hence the need to search for evidence on the web). We report results from several variants of our approach using a human evaluation of extracted claim/evidence pairs, and find that a simple word matching metric (based on Jaccard similarity) can outperform more complex sentence similarity metrics. We also describe many of the difficulties inherent to screenshot analysis and our approaches to overcome them.}, journal={2017 THIRD IEEE INTERNATIONAL CONFERENCE ON BIG DATA COMPUTING SERVICE AND APPLICATIONS (IEEE BIGDATASERVICE 2017)}, author={Jones, Paul and Medd, Dakota and Ramakrishnan, Sreekanth and Shah, Rajat and Keyton, Joann and Samatova, Nagiza}, year={2017}, pages={17–22} } @article{zhang_tang_harenberg_byna_zou_devendran_martin_wu_dong_klasky_et al._2016, title={AMRZone: A Runtime AMR Data Sharing Framework For Scientific Applications}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2016.62}, abstractNote={Frameworks that facilitate runtime data sharingacross multiple applications are of great importance for scientificdata analytics. Although existing frameworks work well overuniform mesh data, they can not effectively handle adaptive meshrefinement (AMR) data. Among the challenges to construct anAMR-capable framework include: (1) designing an architecturethat facilitates online AMR data management, (2) achievinga load-balanced AMR data distribution for the data stagingspace at runtime, and (3) building an effective online indexto support the unique spatial data retrieval requirements forAMR data. Towards addressing these challenges to supportruntime AMR data sharing across scientific applications, wepresent the AMRZone framework. 
Experiments over real-worldAMR datasets demonstrate AMRZone's effectiveness at achievinga balanced workload distribution, reading/writing large-scaledatasets with thousands of parallel processes, and satisfyingqueries with spatial constraints. Moreover, AMRZone's performance and scalability are even comparable with existing state-of-the-art work when tested over uniform mesh data with up to16384 cores, in the best case, our framework achieves a 46% performance improvement.}, journal={2016 16TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID)}, author={Zhang, Wenzhao and Tang, Houjun and Harenberg, Steve and Byna, Surendra and Zou, Xiaocheng and Devendran, Dharshi and Martin, Daniel F. and Wu, Kesheng and Dong, Bin and Klasky, Scott and et al.}, year={2016}, pages={116–125} } @inbook{chaudhary_gonzalez_bello_angus_desai_harenberg_doraiswamy_semazzi_kumar_samatova_2016, title={Causality-Guided Feature Selection}, ISBN={9783319495859 9783319495866}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-49586-6_26}, DOI={10.1007/978-3-319-49586-6_26}, abstractNote={Identifying meaningful features that drive a phenomenon (response) of interest in complex systems of interconnected factors is a challenging problem. Causal discovery methods have been previously applied to estimate bounds on causal strengths of factors on a response or to identify meaningful interactions between factors in complex systems, but these approaches have been used only for inferential purposes. In contrast, we posit that interactions between factors with a potential causal association on a given response could be viable candidates not only for hypothesis generation but also for predictive modeling. 
In this work, we propose a causality-guided feature selection methodology that identifies factors having a potential cause-effect relationship in complex systems, and selects features by clustering them based on their causal strength with respect to the response. To this end, we estimate statistically significant causal effects on the response of factors taking part in potential causal relationships, while addressing associated technical challenges, such as multicollinearity in the data. We validate the proposed methodology for predicting response in five real-world datasets from the domain of climate science and biology. The selected features show predictive skill and consistent performance across different domains.}, booktitle={Advanced Data Mining and Applications}, publisher={Springer International Publishing}, author={Chaudhary, Mandar S. and Gonzalez, Doel L. and Bello, Gonzalo A. and Angus, Michael P. and Desai, Dhara and Harenberg, Steve and Doraiswamy, P. Murali and Semazzi, Fredrick H. M. and Kumar, Vipin and Samatova, Nagiza F.}, year={2016}, pages={391–405} } @inbook{bello_harenberg_agrawal_samatova_2016, title={Community Detection in Dynamic Attributed Graphs}, ISBN={9783319495859 9783319495866}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-49586-6_22}, DOI={10.1007/978-3-319-49586-6_22}, abstractNote={Community detection is one of the most widely studied tasks in network analysis because community structures are ubiquitous across real-world networks. These real-world networks are often both attributed and dynamic in nature. In this paper, we propose a community detection algorithm for dynamic attributed graphs that, unlike existing community detection methods, incorporates both temporal and attribute information along with the structural properties of the graph. 
Our proposed algorithm handles graphs with heterogeneous attribute types, as well as changes to both the structure and the attribute information, which is essential for its applicability to real-world networks. We evaluated our proposed algorithm on a variety of synthetically generated benchmark dynamic attributed graphs, as well as on large-scale real-world networks. The results obtained show that our proposed algorithm is able to identify graph partitions of high modularity and high attribute similarity more efficiently than state-of-the-art methods for community detection.}, booktitle={Advanced Data Mining and Applications}, publisher={Springer International Publishing}, author={Bello, Gonzalo A. and Harenberg, Steve and Agrawal, Abhishek and Samatova, Nagiza F.}, year={2016}, pages={329–344} } @inproceedings{zhang_tang_ranshous_byna_martin_wu_dong_klasky_samatova_2016, title={Exploring memory hierarchy and network topology for runtime AMR data sharing across scientific applications}, DOI={10.1109/bigdata.2016.7840743}, abstractNote={Runtime data sharing across applications is of great importance for avoiding high I/O overhead for scientific data analytics. Sharing data on a staging space running on a set of dedicated compute nodes is faster than writing data to a slow disk-based parallel file system (PFS) and then reading it back for post-processing. Originally, the staging space has been purely based on main memory (DRAM), and thus was several orders of magnitude faster than the PFS approach. However, storing all the data produced by large-scale simulations on DRAM is impractical. Moving data from memory to SSD-based burst buffers is a potential approach to address this issue. However, SSDs are about one order of magnitude slower than DRAM. To optimize data access performance over the staging space, methods such as prefetching data from SSDs according to detected spatial access patterns and distributing data across the network topology have been explored. 
Although these methods work well for uniform mesh data, which they were designed for, they are not well suited for adaptive mesh refinement (AMR) data. Two major issues must be addressed before constructing such a memory hierarchy and topology-aware runtime AMR data sharing framework: (1) spatial access pattern detection and prefetching for AMR data; (2) AMR data distribution across the network topology at runtime. We propose a framework that addresses these challenges and demonstrate its effectiveness with extensive experiments on AMR data. Our results show the framework's spatial access pattern detection and prefetching methods demonstrate about 26% performance improvement for client analytical processes. Moreover, the framework's topology-aware data placement can improve overall data access performance by up to 18%.}, booktitle={2016 IEEE International Conference on Big Data (Big Data)}, author={Zhang, W. Z. and Tang, H. J. and Ranshous, S. and Byna, S. and Martin, D. F. and Wu, K. S. and Dong, B. and Klasky, S. and Samatova, N. F.}, year={2016}, pages={1359–1366} } @article{tang_byna_harenberg_zhang_zou_martin_dong_devendran_wu_trebotich_et al._2016, title={In situ Storage Layout Optimization for AMR Spatio-temporal Read Accesses}, ISSN={["0190-3918"]}, DOI={10.1109/icpp.2016.53}, abstractNote={Analyses of large simulation data often concentrate on regions in space and in time that contain important information. As simulations adopt Adaptive Mesh Refinement (AMR), the data records from a region of interest could be widely scattered on storage devices and accessing interesting regions results in significantly reduced I/O performance. In this work, we study the organization of block-structured AMR data on storage to improve performance of spatio-temporal data accesses. AMR has a complex hierarchical multi-resolution data structure that does not fit easily with the existing approaches that focus on uniform mesh data. 
To enable efficient AMR read accesses, we develop an in situ data layout optimization framework. Our framework automatically selects from a set of candidate layouts based on a performance model, and reorganizes the data before writing to storage. We evaluate this framework with three AMR datasets and access patterns derived from scientific applications. Our performance model is able to identify the best layout scheme and yields up to a 3X read performance improvement compared to the original layout. Though it is not possible to turn all read accesses into contiguous reads, we are able to achieve 90% of contiguous read throughput with the optimized layouts on average.}, journal={PROCEEDINGS 45TH INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING - ICPP 2016}, author={Tang, Houjun and Byna, Suren and Harenberg, Steve and Zhang, Wenzhao and Zou, Xiaocheng and Martin, Daniel F. and Dong, Bin and Devendran, Dharshi and Wu, Kesheng and Trebotich, David and et al.}, year={2016}, pages={406–415} } @inbook{harenberg_seay_bello_chirkova_doraiswamy_samatova_2016, title={Knowledge-Guided Maximal Clique Enumeration}, ISBN={9783319495859 9783319495866}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-49586-6_43}, DOI={10.1007/978-3-319-49586-6_43}, abstractNote={Maximal clique enumeration is a long-standing problem in graph mining and knowledge discovery. Numerous classic algorithms exist for solving this problem. However, these algorithms focus on enumerating all maximal cliques, which may be computationally impractical and much of the output may be irrelevant to the user. To address this issue, we introduce the problem of knowledge-biased clique enumeration, a query-driven formulation that reduces output space, computation time, and memory usage. Moreover, we introduce a dynamic state space indexing strategy for efficiently processing multiple queries over the same graph. 
This strategy reduces redundant computations by dynamically indexing the constituent state space generated with each query. Experimental results over real-world networks demonstrate this strategy's effectiveness at reducing the cumulative query-response time. Although developed in the context of maximal cliques, our techniques could possibly be generalized to other constraint-based graph enumeration tasks.}, booktitle={Advanced Data Mining and Applications}, publisher={Springer International Publishing}, author={Harenberg, Steve and Seay, Ramona G. and Bello, Gonzalo A. and Chirkova, Rada Y. and Doraiswamy, P. Murali and Samatova, Nagiza F.}, year={2016}, pages={604–618} } @article{tang_byna_harenberg_zou_zhang_wu_dong_rubel_bouchard_klasky_et al._2016, title={Usage Pattern-Driven Dynamic Data Layout Reorganization}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2016.15}, abstractNote={As scientific simulations and experiments move toward extremely large scales and generate massive amounts of data, the data access performance of analytic applications becomes crucial. A mismatch often happens between write and read patterns of data accesses, typically resulting in poor read performance. Data layout reorganization has been used to improve the locality of data accesses. However, current data reorganizations are static and focus on generating a single (or set of) optimized layouts that rely on prior knowledge of exact future access patterns. We propose a framework that dynamically recognizes the data usage patterns, replicates the data of interest in multiple reorganized layouts that would benefit common read patterns, and makes runtime decisions on selecting a favorable layout for a given read pattern. This framework supports reading individual elements and chunks of a multi-dimensional array of variables. 
Our pattern-driven layout selection strategy achieves multi-fold speedups compared to reading from the original dataset.}, journal={2016 16TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID)}, author={Tang, Houjun and Byna, Suren and Harenberg, Steve and Zou, Xiaocheng and Zhang, Wenzhao and Wu, Kesheng and Dong, Bin and Rubel, Oliver and Bouchard, Kristofer and Klasky, Scott and et al.}, year={2016}, pages={356–365} } @article{chen_schmidt_tian_samatova_zhang_2015, title={An efficient algorithm for pairwise local alignment of protein interaction networks}, volume={13}, ISSN={["1757-6334"]}, DOI={10.1142/s0219720015500031}, abstractNote={ Recently, researchers seeking to understand, modify, and create beneficial traits in organisms have looked for evolutionarily conserved patterns of protein interactions. Their conservation likely means that the proteins of these conserved functional modules are important to the trait's expression. In this paper, we formulate the problem of identifying these conserved patterns as a graph optimization problem, and develop a fast heuristic algorithm for this problem. We compare the performance of our network alignment algorithm to that of the MaWISh algorithm [Koyutürk M, Kim Y, Topkara U, Subramaniam S, Szpankowski W, Grama A, Pairwise alignment of protein interaction networks, J Comput Biol13(2):182–199, 2006.], which bases its search algorithm on a related decision problem formulation. We find that our algorithm discovers conserved modules with a larger number of proteins in an order of magnitude less time. The protein sets found by our algorithm correspond to known conserved functional modules at comparable precision and recall rates as those produced by the MaWISh algorithm. }, number={2}, journal={JOURNAL OF BIOINFORMATICS AND COMPUTATIONAL BIOLOGY}, author={Chen, Wenbin and Schmidt, Matthew and Tian, Wenhong and Samatova, Nagiza F. 
and Zhang, Shaohong}, year={2015}, month={Apr} } @misc{ranshous_shen_koutra_harenberg_faloutsos_samatova_2015, title={Anomaly detection in dynamic networks: a survey}, volume={7}, ISSN={["1939-0068"]}, DOI={10.1002/wics.1347}, abstractNote={Anomaly detection is an important problem with multiple applications, and thus has been studied for decades in various research domains. In the past decade there has been a growing interest in anomaly detection in data represented as networks, or graphs, largely because of their robust expressiveness and their natural ability to represent complex relationships. Originally, techniques focused on anomaly detection in static graphs, which do not change and are capable of representing only a single snapshot of data. As real‐world networks are constantly changing, there has been a shift in focus to dynamic graphs, which evolve over time.In this survey, we aim to provide a comprehensive overview of anomaly detection in dynamic networks, concentrating on the state‐of‐the‐art methods. We first describe four types of anomalies that arise in dynamic networks, providing an intuitive explanation, applications, and a concrete example for each. Having established an idea for what constitutes an anomaly, a general two‐stage approach to anomaly detection in dynamic networks that is common among the methods is presented. We then construct a two‐tiered taxonomy, first partitioning the methods based on the intuition behind their approach, and subsequently subdividing them based on the types of anomalies they detect. Within each of the tier one categories—community, compression, decomposition, distance, and probabilistic model based—we highlight the major similarities and differences, showing the wealth of techniques derived from similar conceptual approaches. WIREs Comput Stat 2015, 7:223–247. 
doi: 10.1002/wics.1347. This article is categorized under: Algorithms and Computational Methods > Algorithms Data: Types and Structure > Graph and Network Data Statistical Learning and Exploratory Methods of the Data Sciences > Pattern Recognition }, number={3}, journal={WILEY INTERDISCIPLINARY REVIEWS-COMPUTATIONAL STATISTICS}, author={Ranshous, Stephen and Shen, Shitian and Koutra, Danai and Harenberg, Steve and Faloutsos, Christos and Samatova, Nagiza F.}, year={2015}, pages={223–247} } @article{zhang_tang_zou_harenberg_liu_klasky_samatova_2015, title={Exploring Memory Hierarchy to Improve Scientific Data Read Performance}, ISSN={["1552-5244"]}, DOI={10.1109/cluster.2015.18}, abstractNote={Improving read performance is one of the major challenges with speeding up scientific data analytic applications. Utilizing the memory hierarchy is one major line of research to address the read performance bottleneck. Related methods usually combine solid-state drives (SSDs) with dynamic random-access memory (DRAM) and/or parallel file system (PFS) to mitigate the speed and space gap between DRAM and PFS. However, these methods are unable to handle key performance issues plaguing SSDs, namely read contention that may cause up to 50% performance reduction. In this paper, we propose a framework that exploits the memory hierarchy resource to address the read contention issues involved with SSDs. The framework employs a general purpose online read algorithm that is able to detect and utilize memory hierarchy resource to relieve the problem. To maintain a near optimal operating environment for SSDs, the framework is able to orchestrate data chunks across different memory layers to facilitate the read algorithm. 
Compared to existing tools, our framework achieves up to 50% read performance improvement when tested on datasets from real-world scientific simulations.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON CLUSTER COMPUTING - CLUSTER 2015}, author={Zhang, Wenzhao and Tang, Houjun and Zou, Xiaocheng and Harenberg, Steven and Liu, Qing and Klasky, Scott and Samatova, Nagiza F.}, year={2015}, pages={66–69} } @article{chen_samatova_stallmann_hendrix_ying_2016, title={On size-constrained minimum s-t cut problems and size-constrained dense subgraph problems}, volume={609}, ISSN={["1879-2294"]}, DOI={10.1016/j.tcs.2015.10.031}, abstractNote={In some application cases, the solutions of combinatorial optimization problems on graphs should satisfy an additional vertex size constraint. In this paper, we consider size-constrained minimum s–t cut problems and size-constrained dense subgraph problems. We introduce the minimum s–t cut with at-least-k vertices problem, the minimum s–t cut with at-most-k vertices problem, and the minimum s–t cut with exactly k vertices problem. We prove that they are NP-complete. Thus, they are not polynomially solvable unless P=NP. On the other hand, we also study the densest at-least-k-subgraph problem (DalkS) and the densest at-most-k-subgraph problem (DamkS) introduced by Andersen and Chellapilla [1]. We present a polynomial time algorithm for DalkS when k is bounded by some constant c. We also present two approximation algorithms for DamkS. The first approximation algorithm for DamkS has an approximation ratio of n−1k−1, where n is the number of vertices in the input graph. The second approximation algorithm for DamkS has an approximation ratio of O(nδ), for some δ<1/3.}, journal={THEORETICAL COMPUTER SCIENCE}, author={Chen, Wenbin and Samatova, Nagiza F. and Stallmann, Matthias F. 
and Hendrix, William and Ying, Weiqin}, year={2016}, month={Jan}, pages={434–442} } @article{gonzalez_angus_tetteh_bello_padmanabhan_pendse_srinivas_yu_semazzi_kumar_et al._2015, title={On the data-driven inference of modulatory networks in climate science: an application to West African rainfall}, volume={22}, ISSN={["1607-7946"]}, DOI={10.5194/npg-22-33-2015}, abstractNote={Abstract. Decades of hypothesis-driven and/or first-principles research have been applied towards the discovery and explanation of the mechanisms that drive climate phenomena, such as western African Sahel summer rainfall~variability. Although connections between various climate factors have been theorized, not all of the key relationships are fully understood. We propose a data-driven approach to identify candidate players in this climate system, which can help explain underlying mechanisms and/or even suggest new relationships, to facilitate building a more comprehensive and predictive model of the modulatory relationships influencing a climate phenomenon of interest. We applied coupled heterogeneous association rule mining (CHARM), Lasso multivariate regression, and dynamic Bayesian networks to find relationships within a complex system, and explored means with which to obtain a consensus result from the application of such varied methodologies. Using this fusion of approaches, we identified relationships among climate factors that modulate Sahel rainfall. These relationships fall into two categories: well-known associations from prior climate knowledge, such as the relationship with the El Niño–Southern Oscillation (ENSO) and putative links, such as North Atlantic Oscillation, that invite further research. }, number={1}, journal={NONLINEAR PROCESSES IN GEOPHYSICS}, author={Gonzalez, D. L., II and Angus, M. P. and Tetteh, I. K. and Bello, G. A. and Padmanabhan, K. and Pendse, S. V. and Srinivas, S. and Yu, J. and Semazzi, F. and Kumar, V. 
and et al.}, year={2015}, pages={33–46} } @article{zou_wu_boyuka_martin_byna_tang_bansal_ligocki_johansen_samatova_2015, title={Parallel In Situ Detection of Connected Components in Adaptive Mesh Refinement Data}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2015.154}, abstractNote={Adaptive Mesh Refinement (AMR) represents a significant advance for scientific simulation codes, greatly reducing memory and compute requirements by dynamically varying simulation resolution over space and time. As simulation codes transition to AMR, existing analysis algorithms must also make this transition. One such algorithm, connected component detection, is of vital importance in many simulation and analysis contexts, with some simulation codes even relying on parallel, in situ connected component detection for correctness. Yet, current detection algorithms designed for uniform meshes are not applicable to hierarchical, non-uniform AMR, and to the best of our knowledge, AMR connected component detection has not been explored in the literature. Therefore, in this paper, we formally define the general problem of connected component detection for AMR, and present a general solution. Beyond solving the general detection problem, achieving viable in situ detection performance is even more challenging. The core issue is the conflict between the communication-intensive nature of connected component detection (in general, and especially for AMR data) and the requirement that in situ processes incur minimal performance impact on the co-located simulation. We address this challenge by presenting the first connected component detection methodology for structured AMR that is applicable in a parallel, in situ context. Our key strategy is the incorporation of a multi-phase AMR-aware communication pattern that synchronizes connectivity information across the AMR hierarchy. 
In addition, we distil our methodology to a generic framework within the Chombo AMR infrastructure, making connected component detection services available for many existing applications. We demonstrate our method's efficacy by showing its ability to detect ice calving events in real time within the real-world BISICLES ice sheet modelling code. Results show up to a 6.8x speedup of our algorithm over the existing specialized BISICLES algorithm. We also show scalability results for our method up to 4,096 cores using a parallel Chombo-based benchmark.}, journal={2015 15TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING}, author={Zou, Xiaocheng and Wu, Kesheng and Boyuka, David A. and Martin, Daniel F. and Byna, Suren and Tang, Houjun and Bansal, Kushal and Ligocki, Terry J. and Johansen, Hans and Samatova, Nagiza F.}, year={2015}, pages={302–312} } @inbook{bello_angus_pedemane_harlalka_semazzi_kumar_samatova_2015, title={Response-Guided Community Detection: Application to Climate Index Discovery}, ISBN={9783319235240 9783319235257}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-23525-7_45}, DOI={10.1007/978-3-319-23525-7_45}, abstractNote={Discovering climate indices–time series that summarize spatiotemporal climate patterns–is a key task in the climate science domain. In this work, we approach this task as a problem of response-guided community detection; that is, identifying communities in a graph associated with a response variable of interest. To this end, we propose a general strategy for response-guided community detection that explicitly incorporates information of the response variable during the community detection process, and introduce a graph representation of spatiotemporal data that leverages information from multiple variables. We apply our proposed methodology to the discovery of climate indices associated with seasonal rainfall variability. 
Our results suggest that our methodology is able to capture the underlying patterns known to be associated with the response variable of interest and to improve its predictability compared to existing methodologies for data-driven climate index discovery and official forecasts.}, booktitle={Machine Learning and Knowledge Discovery in Databases}, publisher={Springer International Publishing}, author={Bello, Gonzalo A. and Angus, Michael and Pedemane, Navya and Harlalka, Jitendra K. and Semazzi, Fredrick H. M. and Kumar, Vipin and Samatova, Nagiza F.}, year={2015}, pages={736–751} } @article{boyuka_tang_bansal_zou_klasky_samatova_2015, title={The Hyperdyadic Index and Generalized Indexing and Query with PIQUE}, DOI={10.1145/2791347.2791374}, abstractNote={Many scientists rely on indexing and query to identify trends and anomalies within extreme-scale scientific data. Compressed bitmap indexing (e.g., FastBit) is the go-to indexing method for many scientific datasets and query workloads. Recently, the ALACRITY compressed inverted index was shown as a viable alternative approach. Notably, though FastBit and ALACRITY employ very different data structures (inverted list vs. bitmap) and binning methods (bit-wise vs. decimal-precision), close examination reveals marked similarities in index structure. Motivated by this observation, we ask two questions. First, "Can we generalize FastBit and ALACRITY to an index model encompassing both?" And second, if so, "Can such a generalized framework enable other, new indexing methods?" This paper answers both questions in the affirmative. First, we present PIQUE, a Parallel Indexing and Query Unified Engine, based on formal mathematical decomposition of the indexing process. PIQUE factors out commonalities in indexing, employing algorithmic/data structure "plugins" to mix orthogonal indexing concepts such as FastBit compressed bitmaps with ALACRITY binning, all within one framework. 
Second, we define the hyperdyadic tree index, distinct from both bitmap and inverted indexes, demonstrating good index compression while maintaining high query performance. We implement the hyperdyadic tree index within PIQUE, reinforcing our unified indexing model. We conduct a performance study of the hyperdyadic tree index vs. WAH compressed bitmaps, both within PIQUE and compared to FastBit, a state-of-the-art bitmap index system. The hyperdyadic tree index shows a 1.14-1.90x storage reduction vs. compressed bitmaps, with comparable or better query performance under most scenarios tested.}, journal={PROCEEDINGS OF THE 27TH INTERNATIONAL CONFERENCE ON SCIENTIFIC AND STATISTICAL DATABASE MANAGEMENT}, author={Boyuka, David A., II and Tang, Houjun and Bansal, Kushal and Zou, Xiaocheng and Klasky, Scott and Samatova, Nagiza F.}, year={2015} } @article{harenberg_bello_gjeltema_ranshous_harlalka_seay_padmanabhan_samatova_2014, title={Community detection in large-scale networks: a survey and empirical evaluation}, volume={6}, ISSN={1939-5108}, url={http://dx.doi.org/10.1002/WICS.1319}, DOI={10.1002/WICS.1319}, abstractNote={Community detection is a common problem in graph data analytics that consists of finding groups of densely connected nodes with few connections to nodes outside of the group. In particular, identifying communities in large‐scale networks is an important task in many scientific domains. In this review, we evaluated eight state‐of‐the‐art and five traditional algorithms for overlapping and disjoint community detection on large‐scale real‐world networks with known ground‐truth communities. These 13 algorithms were empirically compared using goodness metrics that measure the structural properties of the identified communities, as well as performance metrics that evaluate these communities against the ground‐truth. Our results show that these two types of metrics are not equivalent. 
That is, an algorithm may perform well in terms of goodness metrics, but poorly in terms of performance metrics, or vice versa. WIREs Comput Stat 2014, 6:426–439. doi: 10.1002/wics.1319. This article is categorized under: Algorithms and Computational Methods > Algorithms Statistical Learning and Exploratory Methods of the Data Sciences > Clustering and Classification Data: Types and Structure > Graph and Network Data }, number={6}, journal={Wiley Interdisciplinary Reviews: Computational Statistics}, publisher={Wiley}, author={Harenberg, Steve and Bello, Gonzalo and Gjeltema, L. and Ranshous, Stephen and Harlalka, Jitendra and Seay, Ramona and Padmanabhan, Kanchana and Samatova, Nagiza}, year={2014}, month={Jul}, pages={426–439} } @article{lakshminarasimhan_zou_boyuka_pendse_jenkins_vishwanath_papka_klasky_samatova_2014, title={DIRAQ: scalable in situ data- and resource-aware indexing for optimized query performance}, volume={17}, ISSN={["1573-7543"]}, DOI={10.1007/s10586-014-0358-z}, number={4}, journal={CLUSTER COMPUTING-THE JOURNAL OF NETWORKS SOFTWARE TOOLS AND APPLICATIONS}, author={Lakshminarasimhan, Sriram and Zou, Xiaocheng and Boyuka, David A., II and Pendse, Saurabh V. and Jenkins, John and Vishwanath, Venkatram and Papka, Michael E. and Klasky, Scott and Samatova, Nagiza F.}, year={2014}, month={Dec}, pages={1101–1119} } @article{liess_kumar_snyder_kawale_steinhaeuser_semazzi_ganguly_samatova_kumar_2014, title={Different Modes of Variability over the Tasman Sea: Implications for Regional Climate}, volume={27}, ISSN={["1520-0442"]}, DOI={10.1175/jcli-d-13-00713.1}, abstractNote={Abstract A new approach is used to detect atmospheric teleconnections without being bound by orthogonality (such as empirical orthogonal functions). This method employs negative correlations in a global dataset to detect potential teleconnections. One teleconnection occurs between the Tasman Sea and the Southern Ocean. 
It is related to El Niño–Southern Oscillation (ENSO), the Indian Ocean dipole (IOD), and the southern annular mode (SAM). This teleconnection is significantly correlated with SAM during austral summer, fall, and winter, with IOD during spring, and with ENSO in summer. It can thus be described as a hybrid between these modes. Given previously found relationships between IOD and ENSO, and IOD’s proximity to the teleconnection centers, correlations to IOD are generally stronger than to ENSO. Increasing pressure over the Tasman Sea leads to higher (lower) surface temperature over eastern Australia (the southwestern Pacific) in all seasons and is related to reduced surface temperature over Wilkes Land and Adélie Land in Antarctica during fall and winter. Precipitation responses are generally negative over New Zealand. For one standard deviation of the teleconnection index, precipitation anomalies are positive over Australia in fall, negative over southern Australia in winter and spring, and negative over eastern Australia in summer. When doubling the threshold, the size of the anomalous high-pressure center increases and annual precipitation anomalies are negative over southeastern Australia and northern New Zealand. Eliassen–Palm fluxes quantify the seasonal dependence of SAM, ENSO, and IOD influences. Analysis of the dynamical interactions between these teleconnection patterns can improve prediction of seasonal temperature and precipitation patterns in Australia and New Zealand.}, number={22}, journal={JOURNAL OF CLIMATE}, author={Liess, Stefan and Kumar, Arjun and Snyder, Peter K. and Kawale, Jaya and Steinhaeuser, Karsten and Semazzi, Frederick H. M. and Ganguly, Auroop R. and Samatova, Nagiza F. 
and Kumar, Vipin}, year={2014}, month={Nov}, pages={8466–8486} } @inbook{zou_lakshminarasimhan_boyuka_ranshous_tang_klasky_samatova_2014, title={Fast Set Intersection through Run-Time Bitmap Construction over PForDelta-Compressed Indexes}, ISBN={9783319098722 9783319098739}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-09873-9_56}, DOI={10.1007/978-3-319-09873-9_56}, abstractNote={Set intersection is a fundamental operation for evaluating conjunctive queries in the context of scientific data analysis. The state-of-the-art approach in performing set intersection, compressed bitmap indexing, achieves high computational efficiency because of cheap bitwise operations; however, overall efficiency is often nullified by the HPC I/O bottleneck, because compressed bitmap indexes typically exhibit a heavy storage footprint. Conversely, the recently-presented PForDelta-compressed index has been demonstrated to be storage-lightweight, but has limited performance for set intersection. Thus, a more effective set intersection approach should be efficient in both computation and I/O. Therefore, we propose a fast set intersection approach that couples the storage light-weight PForDelta indexing format with computationally-efficient bitmaps through a specialized on-the-fly conversion. The resultant challenge is to ensure this conversion process is fast enough to maintain the performance gains from both PForDelta and the bitmaps. To this end, we contribute two key enhancements to PForDelta, BitRun and BitExp, which improve bitmap conversion through bulk bit-setting and a more streamlined PForDelta decoding process, respectively. 
Our experimental results show that our integrated PForDelta-bitmap method speeds up conjunctive queries by up to 7.7x versus the state-of-the-art approach, while using indexes that require 15%-60% less storage in most cases.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Zou, Xiaocheng and Lakshminarasimhan, Sriram and Boyuka, David A., II and Ranshous, Stephen and Tang, Houjun and Klasky, Scott and Samatova, Nagiza F.}, year={2014}, pages={668–679} } @inbook{tang_zou_jenkins_boyuka_ranshous_kimpe_klasky_samatova_2014, title={Improving Read Performance with Online Access Pattern Analysis and Prefetching}, ISBN={9783319098722 9783319098739}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-09873-9_21}, DOI={10.1007/978-3-319-09873-9_21}, abstractNote={Among the major challenges of transitioning to exascale in HPC is the ubiquitous I/O bottleneck. For analysis and visualization applications in particular, this bottleneck is exacerbated by the write-onceread- many property of most scientific datasets combined with typically complex access patterns. One promising way to alleviate this problem is to recognize the application's access patterns and utilize them to prefetch data, thereby overlapping computation and I/O. However, current research methods for analyzing access patterns are either offline-only and/or lack the support for complex access patterns, such as high-dimensional strided or composition-based unstructured access patterns. Therefore, we propose an online analyzer capable of detecting both simple and complex access patterns with low computational and memory overhead and high accuracy. 
By combining our pattern detection with prefetching, we consistently observe run-time reductions, up to 26%, across 18 configurations of PIOBench and 4 configurations of a micro-benchmark with both structured and unstructured access patterns.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Tang, Houjun and Zou, Xiaocheng and Jenkins, John and Boyuka, David A., II and Ranshous, Stephen and Kimpe, Dries and Klasky, Scott and Samatova, Nagiza F.}, year={2014}, pages={246–257} } @article{jenkins_dinan_balaji_peterka_samatova_thakur_2014, title={Processing MPI Derived Datatypes on Noncontiguous GPU-Resident Data}, volume={25}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2013.234}, abstractNote={Driven by the goals of efficient and generic communication of noncontiguous data layouts in GPU memory, for which solutions do not currently exist, we present a parallel, noncontiguous data-processing methodology through the MPI datatypes specification. Our processing algorithm utilizes a kernel on the GPU to pack arbitrary noncontiguous GPU data by enriching the datatypes encoding to expose a fine-grained, data-point level of parallelism. Additionally, the typically tree-based datatype encoding is preprocessed to enable efficient, cached access across GPU threads. Using CUDA, we show that the computational method outperforms DMA-based alternatives for several common data layouts as well as more complex data layouts for which reasonable DMA-based processing does not exist. Our method incurs low overhead for data layouts that closely match best-case DMA usage or that can be processed by layout-specific implementations. We additionally investigate usage scenarios for data packing that incur resource contention, identifying potential pitfalls for various packing strategies. 
We also demonstrate the efficacy of kernel-based packing in various communication scenarios, showing multifold improvement in point-to-point communication and evaluating packing within the context of the SHOC stencil benchmark and HACC mesh analysis.}, number={10}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Jenkins, John and Dinan, James and Balaji, Pavan and Peterka, Tom and Samatova, Nagiza F. and Thakur, Rajeev}, year={2014}, month={Oct}, pages={2627–2637} } @inbook{jenkins_zou_tang_kimpe_ross_samatova_2014, title={RADAR: Runtime Asymmetric Data-Access Driven Scientific Data Replication}, ISBN={9783319075174 9783319075181}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-07518-1_19}, DOI={10.1007/978-3-319-07518-1_19}, abstractNote={Efficient I/O on large-scale spatiotemporal scientific data requires scrutiny of both the logical layout of the data (e.g., row-major vs. column-major) and the physical layout (e.g., distribution on parallel filesystems). For increasingly complex datasets, hand optimization is a difficult matter prone to error and not scalable to the increasing heterogeneity of analysis workloads. Given these factors, we present a partial data replication system called RADAR. We capture datatype- and collective-aware I/O access patterns (indicating logical access) via MPI-IO tracing and use a combination of coarse-grained and fine-grained performance modeling to evaluate and select optimized physical data distributions for the task at hand. Unlike conventional methods, we store all replica data and metadata, along with the original untouched data, under a single file container using the object abstraction in parallel filesystems. 
Our system results in manyfold improvements in some commonly used subvolume decomposition access patterns. Moreover, the modeling approach can determine whether such optimizations should be undertaken in the first place.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Jenkins, John and Zou, Xiaocheng and Tang, Houjun and Kimpe, Dries and Ross, Robert and Samatova, Nagiza F.}, year={2014}, pages={296–313} } @article{chen_chen_samatova_peng_wang_tang_2014, title={Solving the maximum duo-preservation string mapping problem with linear programming}, volume={530}, ISSN={["1879-2294"]}, DOI={10.1016/j.tcs.2014.02.017}, abstractNote={In this paper, we introduce the maximum duo-preservation string mapping problem (MPSM), which is complementary to the minimum common string partition problem (MCSP). When each letter occurs at most k times in any input string, the version of MPSM is called k-MPSM. In order to design approximation algorithms for MPSM, we also introduce the constrained maximum induced subgraph problem (CMIS) and the constrained minimum induced subgraph (CNIS) problem. We show that both CMIS and CNIS are NP-complete. We also study the approximation algorithms for the restricted version of CMIS, which is called k-CMIS (k⩾2). Using Linear Programming method, we propose an approximation algorithm for 2-CMIS with approximation ratio 2 and an approximation algorithm for k-CMIS (k⩾3) with approximation ratio k2. Based on approximation algorithms for k-CMIS, we get approximation algorithms for k-MPSM with the same approximation ratio.}, journal={THEORETICAL COMPUTER SCIENCE}, author={Chen, Wenbin and Chen, Zhengzhang and Samatova, Nagiza F. 
and Peng, Lingxi and Wang, Jianxiong and Tang, Maobin}, year={2014}, month={Apr}, pages={1–11} } @article{faghmous_banerjee_shekhar_steinbach_kumar_ganguly_samatova_2014, title={Theory-Guided Data Science for Climate Change}, volume={47}, ISSN={["1558-0814"]}, DOI={10.1109/mc.2014.335}, abstractNote={To adequately address climate change, we need novel data-science methods that account for the spatiotemporal and physical nature of climate phenomena. Only then will we be able to move from statistical analysis to scientific insights.}, number={11}, journal={COMPUTER}, author={Faghmous, James H. and Banerjee, Arindam and Shekhar, Shashi and Steinbach, Michael and Kumar, Vipin and Ganguly, Auroop R. and Samatova, Nagiza}, year={2014}, month={Nov}, pages={74–78} } @article{boyuka_lakshminarasimhan_zou_gong_jenkins_schendel_podhorszki_liu_klasky_samatova_2014, title={Transparent In Situ Data Transformations in ADIOS}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2014.73}, abstractNote={Though an abundance of novel "data transformation" technologies have been developed (such as compression, level-of-detail, layout optimization, and indexing), there remains a notable gap in the adoption of such services by scientific applications. In response, we develop an in situ data transformation framework in the ADIOS I/O middleware with a "plug in" interface, thus greatly simplifying both the deployment and use of data transform services in scientific applications. Our approach ensures user-transparency, runtime-configurability, compatibility with existing I/O optimizations, and the potential for exploiting read-optimizing transforms (such as level-of-detail) to achieve I/O reduction. We demonstrate use of our framework with the QLG simulation at up to 8,192 cores on the leadership-class Titan supercomputer, showing negligible overhead. 
We also explore the read performance implications of data transforms with respect to parameters such as chunk size, access pattern, and the "opacity" of different transform methods including compression and level-of-detail.}, journal={2014 14TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID)}, author={Boyuka, David A., II and Lakshminarasimhan, Sriram and Zou, Xiaocheng and Gong, Zhenhuan and Jenkins, John and Schendel, Eric R. and Podhorszki, Norbert and Liu, Qing and Klasky, Scott and Samatova, Nagiza F.}, year={2014}, pages={256–266} } @inproceedings{schendel_harenberg_tang_vishwanath_papka_samatova_2013, title={A generic high-performance method for deinterleaving scientific data}, volume={8097}, DOI={10.1007/978-3-642-40047-6_58}, abstractNote={High-performance and energy-efficient data management applications are a necessity for HPC systems due to the extreme scale of data produced by high fidelity scientific simulations that these systems support. Data layout in memory hugely impacts the performance. For better performance, most simulations interleave variables in memory during their calculation phase, but deinterleave the data for subsequent storage and analysis. As a result, efficient data deinterleaving is critical; yet, common deinterleaving methods provide inefficient throughput and energy performance. To address this problem, we propose a deinterleaving method that is high performance, energy efficient, and generic to any data type. To the best of our knowledge, this is the first deinterleaving method that 1) exploits data cache prefetching, 2) reduces memory accesses, and 3) optimizes the use of complete cache line writes. When evaluated against conventional deinterleaving methods on 105 STREAM standard micro-benchmarks, our method always improved throughput and throughput/watt on multi-core systems. 
In the best case, our deinterleaving method improved throughput up to 26.2x and throughput/watt up to 7.8x.}, booktitle={Euro-par 2013 parallel processing}, author={Schendel, E. R. and Harenberg, S. and Tang, H. J. and Vishwanath, V. and Papka, M. E. and Samatova, N. F.}, year={2013}, pages={571–582} } @article{kawale_liess_kumar_steinbach_snyder_kumar_ganguly_samatova_semazzi_2013, title={A graph-based approach to find teleconnections in climate data}, volume={6}, ISSN={1932-1864}, url={http://dx.doi.org/10.1002/SAM.11181}, DOI={10.1002/SAM.11181}, abstractNote={AbstractPressure dipoles are important long distance climate phenomena (teleconnection) characterized by pressure anomalies of the opposite polarity appearing at two different locations at the same time. Such dipoles have been proven important for understanding and explaining the variability in climate in many regions of the world, e.g. the El Niño Southern Oscillation (ENSO) climate phenomenon, which is described by opposite pressure anomalies between the west and east Pacific and is known to be responsible for precipitation and temperature anomalies worldwide. This paper presents a graph‐based approach called shared reciprocal nearest neighbor approach that considers only reciprocal positive and negative edges in the shared nearest neighbor graph to find the dipoles. One crucial aspect of our approach to the analysis of such networks is a careful treatment of negative correlations, whose proper consideration is critical for finding the dipoles. Further, our work shows the importance of modeling the time‐dependent patterns of the dipoles in a changing climate in order to better capture the impact of important climate phenomena on the globe. 
To show the utility of finding dipoles using our approach, we show that the data driven dynamic climate indices generated from our algorithm generally perform better than static indices formed from the fixed locations used by climate scientists in terms of capturing impact on global temperature and precipitation. Our approach can generate a single snapshot picture of all the dipole interconnections on the globe in a given dataset and thus makes it possible to study the changes in dipole interactions and movements. As teleconnections are crucial in the understanding of the global climate system, there is a pressing need to better understand the behavior and interactions of these atmospheric processes as well as to capture them precisely. Our systematic graph‐based approach to find the teleconnections in climate data is an attempt in that direction. © 2012 Wiley Periodicals, Inc. Statistical Analysis and Data Mining 6: 158–179, 2013}, number={3}, journal={Statistical Analysis and Data Mining}, publisher={Wiley}, author={Kawale, Jaya and Liess, Stefan and Kumar, Arjun and Steinbach, Michael and Snyder, Peter and Kumar, Vipin and Ganguly, Auroop R. and Samatova, Nagiza F. and Semazzi, Fredrick}, year={2013}, month={Apr}, pages={158–179} } @inbook{jenkins_arkatkar_lakshminarasimhan_boyuka_schendel_shah_ethier_chang_chen_kolla_et al._2013, title={ALACRITY: Analytics-Driven Lossless Data Compression for Rapid In-Situ Indexing, Storing, and Querying}, ISBN={9783642412202 9783642412219}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-41221-9_4}, DOI={10.1007/978-3-642-41221-9_4}, abstractNote={High-performance computing architectures face nontrivial data processing challenges, as computational and I/O components further diverge in performance trajectories. For scientific data analysis in particular, methods based on generating heavyweight access acceleration structures, e.g. indexes, are becoming less feasible for ever-increasing dataset sizes. 
We present ALACRITY, demonstrating the effectiveness of a fused data and index encoding of scientific, floating-point data in generating lightweight data structures amenable to common types of queries used in scientific data analysis. We exploit the representation of floating-point values by extracting significant bytes, using the resulting unique values to bin the remaining data along fixed-precision boundaries. To optimize query processing, we use an inverted index, mapping each generated bin to a list of records contained within, allowing us to optimize query processing with attribute range constraints. Overall, the storage footprint for both index and data is shown to be below numerous configurations of bitmap indexing, while matching or outperforming query performance.}, booktitle={Transactions on Large-Scale Data- and Knowledge-Centered Systems X}, publisher={Springer Berlin Heidelberg}, author={Jenkins, John and Arkatkar, Isha and Lakshminarasimhan, Sriram and Boyuka, David A., II and Schendel, Eric R. and Shah, Neil and Ethier, Stephane and Chang, Choong-Seock and Chen, Jackie and Kolla, Hemanth and et al.}, year={2013}, pages={95–114} } @article{atluri_padmanabhan_fang_steinbach_petrella_lim_macdonald_samatova_doraiswamy_kumar_2013, title={Complex biomarker discovery in neuroimaging data: Finding a needle in a haystack}, volume={3}, ISSN={2213-1582}, url={http://dx.doi.org/10.1016/J.NICL.2013.07.004}, DOI={10.1016/J.NICL.2013.07.004}, abstractNote={Neuropsychiatric disorders such as schizophrenia, bipolar disorder and Alzheimer's disease are major public health problems. However, despite decades of research, we currently have no validated prognostic or diagnostic tests that can be applied at an individual patient level. Many neuropsychiatric diseases are due to a combination of alterations that occur in a human brain rather than the result of localized lesions. 
While there is hope that newer imaging technologies such as functional and anatomic connectivity MRI or molecular imaging may offer breakthroughs, the single biomarkers that are discovered using these datasets are limited by their inability to capture the heterogeneity and complexity of most multifactorial brain disorders. Recently, complex biomarkers have been explored to address this limitation using neuroimaging data. In this manuscript we consider the nature of complex biomarkers being investigated in the recent literature and present techniques to find such biomarkers that have been developed in related areas of data mining, statistics, machine learning and bioinformatics.}, journal={NeuroImage: Clinical}, publisher={Elsevier BV}, author={Atluri, Gowtham and Padmanabhan, Kanchana and Fang, Gang and Steinbach, Michael and Petrella, Jeffrey R. and Lim, Kelvin and MacDonald, Angus, III and Samatova, Nagiza F. and Doraiswamy, P. Murali and Kumar, Vipin}, year={2013}, pages={123–131} } @article{gonzalez_pendse_padmanabhan_angus_tetteh_srinivas_villanes_semazzi_kumar_samatova_2013, title={Coupled Heterogeneous Association Rule Mining (CHARM): Application toward Inference of Modulatory Climate Relationships}, ISSN={["1550-4786"]}, DOI={10.1109/icdm.2013.142}, abstractNote={The complex dynamic climate system often exhibits hierarchical modularity of its organization and function. Scientists have spent decades trying to discover and understand the driving mechanisms behind western African Sahel summer rainfall variability, mostly via hypothesis-driven and/or first-principles based research. Their work has furthered theory regarding the connections between various climate patterns, but the key relationships are still not fully understood. 
We present Coupled Heterogeneous Association Rule Mining (CHARM), a computationally efficient methodology that mines higher-order relationships between these subsystems' anomalous temporal phases with respect to their effect on the system's response. We apply this to climate science data, aiming to infer putative pathways/cascades of modulating events and the modulating signs that collectively define the network of pathways for the rainfall anomaly in the Sahel. Experimental results are consistent with fundamental theories of phenomena in climate science, especially physical processes that best describe sub-regional climate.}, journal={2013 IEEE 13TH INTERNATIONAL CONFERENCE ON DATA MINING (ICDM)}, author={Gonzalez, Doel L., II and Pendse, Saurabh V. and Padmanabhan, Kanchana and Angus, Michael P. and Tetteh, Isaac K. and Srinivas, Shashank and Villanes, Andrea and Semazzi, Fredrick and Kumar, Vipin and Samatova, Nagiza F.}, year={2013}, pages={1055–1060} } @article{tian_samatova_2013, title={Global Alignment of Pairwise Protein Interaction Networks for Maximal Common Conserved Patterns}, volume={2013}, ISSN={["2314-4378"]}, DOI={10.1155/2013/670623}, abstractNote={A number of tools for the alignment of protein-protein interaction (PPI) networks have laid the foundation for PPI network analysis. Most of alignment tools focus on finding conserved interaction regions across the PPI networks through either local or global mapping of similar sequences. Researchers are still trying to improve the speed, scalability, and accuracy of network alignment. In view of this, we introduce a connected-components based fast algorithm, HopeMap, for network alignment. Observing that the size of true orthologs across species is small comparing to the total number of proteins in all species, we take a different approach based on a precompiled list of homologs identified by KO terms. Applying this approach to S. cerevisiae (yeast) and D. melanogaster (fly), E. coli K12 and S. typhimurium, E. 
coli K12 and C. crescentus, we analyze all clusters identified in the alignment. The results are evaluated through up-to-date known gene annotations, gene ontology (GO), and KEGG ortholog groups (KO). Comparing to existing tools, our approach is fast with linear computational cost, highly accurate in terms of KO and GO terms specificity and sensitivity, and can be extended to multiple alignments easily.}, journal={INTERNATIONAL JOURNAL OF GENOMICS}, author={Tian, Wenhong and Samatova, Nagiza F.}, year={2013} } @article{liu_logan_tian_abbasi_podhorszki_choi_klasky_tchoua_lofstead_oldfield_et al._2014, title={Hello ADIOS: the challenges and lessons of developing leadership class I/O frameworks}, volume={26}, ISSN={["1532-0634"]}, DOI={10.1002/cpe.3125}, abstractNote={SUMMARY Applications running on leadership platforms are more and more bottlenecked by storage input/output (I/O). In an effort to combat the increasing disparity between I/O throughput and compute capability, we created Adaptable IO System (ADIOS) in 2005. Focusing on putting users first with a service oriented architecture, we combined cutting edge research into new I/O techniques with a design effort to create near optimal I/O methods. As a result, ADIOS provides the highest level of synchronous I/O performance for a number of mission critical applications at various Department of Energy Leadership Computing Facilities. Meanwhile ADIOS is leading the push for next generation techniques including staging and data processing pipelines. In this paper, we describe the startling observations we have made in the last half decade of I/O research and development, and elaborate the lessons we have learned along this journey. We also detail some of the challenges that remain as we look toward the coming Exascale era. 
Copyright © 2013 John Wiley \& Sons, Ltd.}, number={7}, journal={CONCURRENCY AND COMPUTATION-PRACTICE \& EXPERIENCE}, author={Liu, Qing and Logan, Jeremy and Tian, Yuan and Abbasi, Hasan and Podhorszki, Norbert and Choi, Jong Youl and Klasky, Scott and Tchoua, Roselyne and Lofstead, Jay and Oldfield, Ron and others}, year={2014}, month={May}, pages={1453–1473} } @article{gong_boyuka_zou_liu_podhorszki_klasky_ma_samatova_2013, title={PARLO: PArallel Run-time Layout Optimization for Scientific Data Explorations with Heterogeneous Access Patterns}, ISBN={["978-1-4673-6465-2"]}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2013.58}, abstractNote={The size and scope of cutting-edge scientific simulations are growing much faster than the I/O and storage capabilities of their run-time environments. The growing gap is exacerbated by exploratory, data-intensive analytics, such as querying simulation data with multivariate, spatio-temporal constraints, which induces heterogeneous access patterns that stress the performance of the underlying storage system. Previous work addresses data layout and indexing techniques to improve query performance for a single access pattern, which is not sufficient for complex analytics jobs. We present PARLO a parallel run-time layout optimization framework, to achieve multi-level data layout optimization for scientific applications at run-time before data is written to storage. The layout schemes optimize for heterogeneous access patterns with user-specified priorities. PARLO is integrated with ADIOS, a high-performance parallel I/O middleware for large-scale HPC applications, to achieve user-transparent, light-weight layout optimization for scientific datasets. It offers simple XML-based configuration for users to achieve flexible layout optimization without the need to modify or recompile application codes. 
Experiments show that PARLO improves performance by 2 to 26 times for queries with heterogeneous access patterns compared to state-of-the-art scientific database management systems. Compared to traditional post-processing approaches, its underlying run-time layout optimization achieves a 56% savings in processing time and a reduction in storage overhead of up to 50%. PARLO also exhibits a low run-time resource requirement, while also limiting the performance impact on running applications to a reasonable level.}, journal={PROCEEDINGS OF THE 2013 13TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID 2013)}, author={Gong, Zhenhuan and Boyuka, David A., II and Zou, Xiaocheng and Liu, Qing and Podhorszki, Norbert and Klasky, Scott and Ma, Xiaosong and Samatova, Nagiza F.}, year={2013}, pages={343–351} } @inbook{jenkins_arkatkar_lakshminarasimhan_shah_schendel_ethier_chang_chen_kolla_klasky_et al._2012, title={Analytics-Driven Lossless Data Compression for Rapid In-situ Indexing, Storing, and Querying}, ISBN={9783642325960 9783642325977}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-32597-7_2}, DOI={10.1007/978-3-642-32597-7_2}, abstractNote={The analysis of scientific simulations is highly data-intensive and is becoming an increasingly important challenge. Peta-scale data sets require the use of light-weight query-driven analysis methods, as opposed to heavy-weight schemes that optimize for speed at the expense of size. This paper is an attempt in the direction of query processing over losslessly compressed scientific data. We propose a co-designed double-precision compression and indexing methodology for range queries by performing unique-value-based binning on the most significant bytes of double precision data (sign, exponent, and most significant mantissa bits), and inverting the resulting metadata to produce an inverted index over a reduced data representation. 
Without the inverted index, our method matches or improves compression ratios over both general-purpose and floating-point compression utilities. The inverted index is light-weight, and the overall storage requirement for both reduced column and index is less than 135%, whereas existing DBMS technologies can require 200-400%. As a proof-of-concept, we evaluate univariate range queries that additionally return column values, a critical component of data analytics, against state-of-the-art bitmap indexing technology, showing multi-fold query performance improvements.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Jenkins, John and Arkatkar, Isha and Lakshminarasimhan, Sriram and Shah, Neil and Schendel, Eric R. and Ethier, Stephane and Chang, Choong-Seock and Chen, Jacqueline H. and Kolla, Hemanth and Klasky, Scott and et al.}, year={2012}, pages={16–30} } @inproceedings{jenkins_schendel_lakshminarasimhan_boyuka_rogers_ethier_ross_klasky_samatova_2012, title={Byte-precision level of detail processing for variable precision analytics}, DOI={10.1109/sc.2012.26}, abstractNote={I/O bottlenecks in HPC applications are becoming a more pressing problem as compute capabilities continue to outpace I/O capabilities. While double-precision simulation data often must be stored losslessly, the loss of some of the fractional component may introduce acceptably small errors to many types of scientific analyses. Given this observation, we develop a precision level of detail (APLOD) library, which partitions double-precision datasets along user-defined byte boundaries. APLOD parameterizes the analysis accuracy-I/O performance tradeoff, bounds maximum relative error, maintains I/O access patterns compared to full precision, and operates with low overhead. Using ADIOS as an I/O use-case, we show proportional reduction in disk access time to the degree of precision. 
Finally, we show the effects of partial precision analysis on accuracy for operations such as k-means and Fourier analysis, finding a strong applicability for the use of varying degrees of precision to reduce the cost of analyzing extreme-scale data.}, booktitle={International conference for high performance computing networking}, author={Jenkins, J. and Schendel, E. R. and Lakshminarasimhan, S. and Boyuka, D. A. and Rogers, T. and Ethier, S. and Ross, R. and Klasky, S. and Samatova, N. F.}, year={2012} } @article{chen_hendrix_guan_tetteh_choudhary_semazzi_samatova_2013, title={Discovery of extreme events-related communities in contrasting groups of physical system networks}, volume={27}, ISSN={["1573-756X"]}, DOI={10.1007/s10618-012-0289-3}, abstractNote={The latent behavior of a physical system that can exhibit extreme events such as hurricanes or rainfalls, is complex. Recently, a very promising means for studying complex systems has emerged through the concept of complex networks. Networks representing relationships between individual objects usually exhibit community dynamics. Conventional community detection methods mainly focus on either mining frequent subgraphs in a network or detecting stable communities in time-varying networks. In this paper, we formulate a novel problem—detection of predictive and phase-biased communities in contrasting groups of networks, and propose an efficient and effective machine learning solution for finding such anomalous communities. We build different groups of networks corresponding to different system's phases, such as higher or low hurricane activity, discover phase-related system components as seeds to help bound the search space of community generation in each network, and use the proposed contrast-based technique to identify the changing communities across different groups. 
The detected anomalous communities are hypothesized (1) to play an important role in defining the target system's state(s) and (2) to improve the predictive skill of the system's states when used collectively in the ensemble of predictive models. When tested on the two important extreme event problems—identification of tropical cyclone-related and of African Sahel rainfall-related climate indices—our algorithm demonstrated the superior performance in terms of various skill and robustness metrics, including 8–16 % accuracy increase, as well as physical interpretability of detected communities. The experimental results also show the efficiency of our algorithm on synthetic datasets.}, number={2}, journal={DATA MINING AND KNOWLEDGE DISCOVERY}, author={Chen, Zhengzhang and Hendrix, William and Guan, Hang and Tetteh, Isaac K. and Choudhary, Alok and Semazzi, Fredrick and Samatova, Nagiza F.}, year={2013}, month={Sep}, pages={225–258} } @article{padmanabhan_wang_samatova_2012, title={Functional Annotation of Hierarchical Modularity}, volume={7}, ISSN={["1932-6203"]}, DOI={10.1371/journal.pone.0033744}, abstractNote={In biological networks of molecular interactions in a cell, network motifs that are biologically relevant are also functionally coherent, or form functional modules. These functionally coherent modules combine in a hierarchical manner into larger, less cohesive subsystems, thus revealing one of the essential design principles of system-level cellular organization and function–hierarchical modularity. Arguably, hierarchical modularity has not been explicitly taken into consideration by most, if not all, functional annotation systems. As a result, the existing methods would often fail to assign a statistically significant functional coherence score to biologically relevant molecular machines. We developed a methodology for hierarchical functional annotation. 
Given the hierarchical taxonomy of functional concepts (e.g., Gene Ontology) and the association of individual genes or proteins with these concepts (e.g., GO terms), our method will assign a Hierarchical Modularity Score (HMS) to each node in the hierarchy of functional modules; the HMS score and its value measure functional coherence of each module in the hierarchy. While existing methods annotate each module with a set of “enriched” functional terms in a bag of genes, our complementary method provides the hierarchical functional annotation of the modules and their hierarchically organized components. A hierarchical organization of functional modules often comes as a bi-product of cluster analysis of gene expression data or protein interaction data. Otherwise, our method will automatically build such a hierarchy by directly incorporating the functional taxonomy information into the hierarchy search process and by allowing multi-functional genes to be part of more than one component in the hierarchy. In addition, its underlying HMS scoring metric ensures that functional specificity of the terms across different levels of the hierarchical taxonomy is properly treated. We have evaluated our method using Saccharomyces cerevisiae data from KEGG and MIPS databases and several other computationally derived and curated datasets. 
The code and additional supplemental files can be obtained from http://code.google.com/p/functional-annotation-of-hierarchical-modularity/ (Accessed 2012 March 13).}, number={4}, journal={PLOS ONE}, author={Padmanabhan, Kanchana and Wang, Kuangyu and Samatova, Nagiza F.}, year={2012}, month={Apr} } @article{meleshko_samatova_melechko_2012, title={Group analysis of the thin film dewetting equation}, volume={47}, ISSN={["0020-7462"]}, DOI={10.1016/j.ijnonlinmec.2011.08.005}, abstractNote={The classical Lie group analysis method in addition to constructing exact solutions, provides a regular procedure for mathematical modeling by classifying differential equations with respect to arbitrary elements. In the present paper group analysis is applied for modeling in dewetting of a thin film. The paper is focused on the equation:ht+ΔΔh=div(m∇V(h)).Group classification selects the functions m(h) and V(h) such that Eq. (2) possess additional symmetry properties. The group classification separates the set of the functions m(h) and V(h) into different classes with respect to symmetries.}, number={1}, journal={INTERNATIONAL JOURNAL OF NON-LINEAR MECHANICS}, author={Meleshko, S. V. and Samatova, N. F. and Melechko, A. V.}, year={2012}, month={Jan}, pages={9–13} } @article{gonzalez_chen_tetteh_pansombut_semazzi_kumar_melechko_samatova_2012, title={Hierarchical Classifier-Regression Ensemble for Multi-Phase Non-Linear Dynamic System Response Prediction: Application to Climate Analysis}, ISBN={["978-1-4673-5164-5"]}, ISSN={["2375-9232"]}, DOI={10.1109/icdmw.2012.133}, abstractNote={A dynamic physical system often undergoes phase transitions in response to fluctuations induced on system parameters. For example, hurricane activity is the climate system's response initiated by a liquid-vapor phase transition associated with non-linearly coupled fluctuations in the ocean and the atmosphere. 
Because our quantitative knowledge about highly non-linear dynamic systems is very meager, scientists often resort to linear regression techniques such as Least Absolute Deviation (LAD) to learn the non-linear system's response (e.g., hurricane activity) from observed or simulated system's parameters (e.g., temperature, precipitable water, pressure). While insightful, such models still offer limited predictability, and alternatives intended to capture non-linear behaviors such as Stepwise Regression are often controversial in nature. In this paper, we hypothesize that one of the primary reasons for lack of predictability is the treatment of an inherently multi-phase system as being phase less. To bridge this gap, we propose a hybrid approach that first predicts the phase the system is in, and then estimates the magnitude of the system's response using the regression model optimized for this phase. Our approach is designed for systems that could be characterized by multi-variate spatio-temporal data from observations, simulations, or both.}, journal={12TH IEEE INTERNATIONAL CONFERENCE ON DATA MINING WORKSHOPS (ICDMW 2012)}, author={Gonzalez, Doel L., II and Chen, Zhengzhang and Tetteh, Isaac K. and Pansombut, Tatdow and Semazzi, Fredrick and Kumar, Vipin and Melechko, Anatoli and Samatova, Nagiza F.}, year={2012}, pages={781–788} } @article{lakshminarasimhan_shah_ethier_ku_chang_klasky_latham_ross_samatova_2013, title={ISABELA for effective in situ compression of scientific data}, volume={25}, ISSN={["1532-0634"]}, DOI={10.1002/cpe.2887}, abstractNote={SUMMARYExploding dataset sizes from extreme‐scale scientific simulations necessitates efficient data management and reduction schemes to mitigate I/O costs. With the discrepancy between I/O bandwidth and computational power, scientists are forced to capture data infrequently, thereby making data collection an inherently lossy process. 
Although data compression can be an effective solution, the random nature of real‐valued scientific datasets renders lossless compression routines ineffective. These techniques also impose significant overhead during decompression, making them unsuitable for data analysis and visualization, which require repeated data access. To address this problem, we propose an effective method for In situ Sort‐And‐B‐spline Error‐bounded Lossy Abatement (ISABELA) of scientific data that is widely regarded as effectively incompressible. With ISABELA, we apply a pre‐conditioner to seemingly random and noisy data along spatial resolution to achieve an accurate fitting model that guarantees a ⩾0.99 correlation with the original data. We further take advantage of temporal patterns in scientific data to compress data by ≈ 85\%, while introducing only a negligible overhead on simulations in terms of runtime. ISABELA significantly outperforms existing lossy compression methods, such as wavelet compression, in terms of data reduction and accuracy. We extend upon our previous paper by additionally building a communication‐free, scalable parallel storage framework on top of ISABELA‐compressed data that is ideally suited for extreme‐scale analytical processing. The basis for our storage framework is an inherently local decompression method (it need not decode the entire data), which allows for random access decompression and low‐overhead task division that can be exploited over heterogeneous architectures. Furthermore, analytical operations such as correlation and query processing run quickly and accurately over data in the compressed space. Copyright © 2012 John Wiley \& Sons, Ltd.}, number={4}, journal={CONCURRENCY AND COMPUTATION-PRACTICE \& EXPERIENCE}, author={Lakshminarasimhan, Sriram and Shah, Neil and Ethier, Stephane and Ku, Seung-Hoe and Chang, C. S. 
and Klasky, Scott and Latham, Rob and Ross, Rob and Samatova, Nagiza F.}, year={2013}, pages={524–540} } @article{schendel_jin_shah_chen_chang_ku_ethier_klasky_latham_ross_et al._2012, title={ISOBAR Preconditioner for Effective and High-throughput Lossless Data Compression}, ISSN={["1084-4627"]}, DOI={10.1109/icde.2012.114}, abstractNote={Efficient handling of large volumes of data is a necessity for exascale scientific applications and database systems. To address the growing imbalance between the amount of available storage and the amount of data being produced by high speed (FLOPS) processors on the system, data must be compressed to reduce the total amount of data placed on the file systems. General-purpose loss less compression frameworks, such as zlib and bzlib2, are commonly used on datasets requiring loss less compression. Quite often, however, many scientific data sets compress poorly, referred to as hard-to-compress datasets, due to the negative impact of highly entropic content represented within the data. An important problem in better loss less data compression is to identify the hard-to-compress information and subsequently optimize the compression techniques at the byte-level. To address this challenge, we introduce the In-Situ Orthogonal Byte Aggregate Reduction Compression (ISOBAR-compress) methodology as a preconditioner of loss less compression to identify and optimize the compression efficiency and throughput of hard-to-compress datasets.}, journal={2012 IEEE 28TH INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE)}, author={Schendel, Eric R. and Jin, Ye and Shah, Neil and Chen, Jackie and Chang, C. S. 
and Ku, Seung-Hoe and Ethier, Stephane and Klasky, Scott and Latham, Robert and Ross, Robert and et al.}, year={2012}, pages={138–149} } @article{padmanabhan_wilson_rocha_wang_mihelcic_samatova_2012, title={In-silico identification of phenotype-biased functional modules}, volume={10}, ISSN={["1477-5956"]}, DOI={10.1186/1477-5956-10-s1-s2}, abstractNote={Abstract Background Phenotypes exhibited by microorganisms can be useful for several purposes, e.g., ethanol as an alternate fuel. Sometimes, the target phenotype may be required in combination with other phenotypes, in order to be useful, for e.g., an industrial process may require that the organism survive in an anaerobic, alcohol rich environment and be able to feed on both hexose and pentose sugars to produce ethanol. This combination of traits may not be available in any existing organism or if they do exist, the mechanisms involved in the phenotype-expression may not be efficient enough to be useful. Thus, it may be required to genetically modify microorganisms. However, before any genetic modification can take place, it is important to identify the underlying cellular subsystems responsible for the expression of the target phenotype. Results In this paper, we develop a method to identify statistically significant and phenotypically-biased functional modules. The method can compare the organismal network information from hundreds of phenotype expressing and phenotype non-expressing organisms to identify cellular subsystems that are more prone to occur in phenotype-expressing organisms than in phenotype non-expressing organisms. We have provided literature evidence that the phenotype-biased modules identified for phenotypes such as hydrogen production (dark and light fermentation), respiration, gram-positive, gram-negative and motility, are indeed phenotype-related. Conclusion Thus we have proposed a methodology to identify phenotype-biased cellular subsystems. 
We have shown the effectiveness of our methodology by applying it to several target phenotypes. The code and all supplemental files can be downloaded from (http://freescience.org/cs/phenotype-biased-biclusters/). }, journal={PROTEOME SCIENCE}, author={Padmanabhan, Kanchana and Wilson, Kevin and Rocha, Andrea M. and Wang, Kuangyu and Mihelcic, James R. and Samatova, Nagiza F.}, year={2012}, month={Jun} } @article{gong_lakshminarasimhan_jenkins_kolla_ethier_chen_ross_klasky_samatova_2012, title={Multi-level Layout Optimization for Efficient Spatio-temporal Queries on ISABELA-compressed Data}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2012.83}, abstractNote={The size and scope of cutting-edge scientific simulations are growing much faster than the I/O subsystems of their runtime environments, not only making I/O the primary bottleneck, but also consuming space that pushes the storage capacities of many computing facilities. These problems are exacerbated by the need to perform data-intensive analytics applications, such as querying the dataset by variable and spatio-temporal constraints, for what current database technologies commonly build query indices of size greater than that of the raw data. To help solve these problems, we present a parallel query-processing engine that can handle both range queries and queries with spatio-temporal constraints, on B-spline compressed data with user-controlled accuracy. Our method adapts to widening gaps between computation and I/O performance by querying on compressed metadata separated into bins by variable values, utilizing Hilbert space-filling curves to optimize for spatial constraints and aggregating data access to improve locality of per-bin stored data, reducing the false positive rate and latency bound I/O operations (such as seek) substantially. 
We show our method to be efficient with respect to storage, computation, and I/O compared to existing database technologies optimized for query processing on scientific data.}, journal={2012 IEEE 26TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Gong, Zhenhuan and Lakshminarasimhan, Sriram and Jenkins, John and Kolla, Hemanth and Ethier, Stephane and Chen, Jackie and Ross, Robert and Klasky, Scott and Samatova, Nagiza F.}, year={2012}, pages={873–884} } @article{schmidt_rocha_padmanabhan_shpanskaya_banfield_scott_mihelcic_samatova_2012, title={NIBBS-Search for Fast and Accurate Prediction of Phenotype-Biased Metabolic Systems}, volume={8}, ISSN={["1553-7358"]}, DOI={10.1371/journal.pcbi.1002490}, abstractNote={Understanding of genotype-phenotype associations is important not only for furthering our knowledge on internal cellular processes, but also essential for providing the foundation necessary for genetic engineering of microorganisms for industrial use (e.g., production of bioenergy or biofuels). However, genotype-phenotype associations alone do not provide enough information to alter an organism's genome to either suppress or exhibit a phenotype. It is important to look at the phenotype-related genes in the context of the genome-scale network to understand how the genes interact with other genes in the organism. Identification of metabolic subsystems involved in the expression of the phenotype is one way of placing the phenotype-related genes in the context of the entire network. A metabolic system refers to a metabolic network subgraph; nodes are compounds and edges labels are the enzymes that catalyze the reaction. The metabolic subsystem could be part of a single metabolic pathway or span parts of multiple pathways. Arguably, comparative genome-scale metabolic network analysis is a promising strategy to identify these phenotype-related metabolic subsystems. 
Network Instance-Based Biased Subgraph Search (NIBBS) is a graph-theoretic method for genome-scale metabolic network comparative analysis that can identify metabolic systems that are statistically biased toward phenotype-expressing organismal networks. We set up experiments with target phenotypes like hydrogen production, TCA expression, and acid-tolerance. We show via extensive literature search that some of the resulting metabolic subsystems are indeed phenotype-related and formulate hypotheses for other systems in terms of their role in phenotype expression. NIBBS is also orders of magnitude faster than MULE, one of the most efficient maximal frequent subgraph mining algorithms that could be adjusted for this problem. Also, the set of phenotype-biased metabolic systems output by NIBBS comes very close to the set of phenotype-biased subgraphs output by an exact maximally-biased subgraph enumeration algorithm ( MBS-Enum ). The code (NIBBS and the module to visualize the identified subsystems) is available at http://freescience.org/cs/NIBBS.}, number={5}, journal={PLOS COMPUTATIONAL BIOLOGY}, author={Schmidt, Matthew C. and Rocha, Andrea M. and Padmanabhan, Kanchana and Shpanskaya, Yekaterina and Banfield, Jill and Scott, Kathleen and Mihelcic, James R. and Samatova, Nagiza F.}, year={2012}, month={May} } @inproceedings{lakshminarasimhan_kumar_liao_choudhary_kumar_samatova_2012, title={On the path to sustainable, scalable, and energy-efficient data analytics: Challenges, promises, and future directions}, DOI={10.1109/igcc.2012.6322265}, abstractNote={As scientific data is reaching exascale, scalable and energy efficient data analytics is quickly becoming a top notch priority. Yet, a sustainable solution to this problem is hampered by a number of technical challenges that get exacerbated with the emerging hardware and software technology trends. In this paper, we present a number of recently created “secret sauces” that promise to address some of these challenges. 
We discuss transformative approaches to efficient data reduction, analytics-driven query processing, scalable analytical kernels, approximate analytics, among others. We propose a number of future directions that could be pursued on the path to sustainable data analytics at scale.}, booktitle={2012 International Green Computing Conference (IGCC)}, author={Lakshminarasimhan, S. and Kumar, P. and Liao, W. K. and Choudhary, A. and Kumar, V. and Samatova, N. F.}, year={2012} } @article{chen_hendrix_samatova_2012, title={Community-based anomaly detection in evolutionary networks}, volume={39}, DOI={10.1007/s10844-011-0183-2}, abstractNote={Networks of dynamic systems, including social networks, the World Wide Web, climate networks, and biological networks, can be highly clustered. Detecting clusters, or communities, in such dynamic networks is an emerging area of research; however, less work has been done in terms of detecting community-based anomalies. While there has been some previous work on detecting anomalies in graph-based data, none of these anomaly detection approaches have considered an important property of evolutionary networks—their community structure. In this work, we present an approach to uncover community-based anomalies in evolutionary networks characterized by overlapping communities. We develop a parameter-free and scalable algorithm using a proposed representative-based technique to detect all six possible types of community-based anomalies: grown, shrunken, merged, split, born, and vanished communities. We detail the underlying theory required to guarantee the correctness of the algorithm. We measure the performance of the community-based anomaly detection algorithm by comparison to a non–representative-based algorithm on synthetic networks, and our experiments on synthetic datasets show that our algorithm achieves a runtime speedup of 11–46 over the baseline algorithm. 
We have also applied our algorithm to two real-world evolutionary networks, Food Web and Enron Email. Significant and informative community-based anomaly dynamics have been detected in both cases.}, number={1}, journal={Journal of Intelligent Information Systems}, author={Chen, Z. Z. and Hendrix, W. and Samatova, N. F.}, year={2012}, pages={59–85} } @inbook{lakshminarasimhan_shah_ethier_klasky_latham_ross_samatova_2011, title={Compressing the Incompressible with ISABELA: In-situ Reduction of Spatio-temporal Data}, ISBN={9783642233999 9783642234002}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-23400-2_34}, DOI={10.1007/978-3-642-23400-2_34}, abstractNote={Modern large-scale scientific simulations running on HPC systems generate data in the order of terabytes during a single run. To lessen the I/O load during a simulation run, scientists are forced to capture data infrequently, thereby making data collection an inherently lossy process. Yet, lossless compression techniques are hardly suitable for scientific data due to its inherently random nature; for the applications used here, they offer less than 10% compression rate. They also impose significant overhead during decompression, making them unsuitable for data analysis and visualization that require repeated data access. To address this problem, we propose an effective method for In-situ Sort-And-B-spline Error-bounded Lossy Abatement (ISABELA) of scientific data that is widely regarded as effectively incompressible. With ISABELA, we apply a preconditioner to seemingly random and noisy data along spatial resolution to achieve an accurate fitting model that guarantees a ≥ 0.99 correlation with the original data. We further take advantage of temporal patterns in scientific data to compress data by ≈ 85%, while introducing only a negligible overhead on simulations in terms of runtime. ISABELA significantly outperforms existing lossy compression methods, such as Wavelet compression. 
Moreover, besides being a communication-free and scalable compression technique, ISABELA is an inherently local decompression method, namely it does not decode the entire data, making it attractive for random access.}, booktitle={Euro-Par 2011 Parallel Processing}, publisher={Springer Berlin Heidelberg}, author={Lakshminarasimhan, Sriram and Shah, Neil and Ethier, Stephane and Klasky, Scott and Latham, Rob and Ross, Rob and Samatova, Nagiza F.}, year={2011}, pages={366–379} } @article{hendrix_rocha_padmanabhan_choudhary_scott_mihelcic_samatova_2011, title={DENSE: efficient and prior knowledge-driven discovery of phenotype-associated protein functional modules}, volume={5}, ISSN={["1752-0509"]}, DOI={10.1186/1752-0509-5-172}, abstractNote={Abstract Background Identifying cellular subsystems that are involved in the expression of a target phenotype has been a very active research area for the past several years. In this paper, cellular subsystem refers to a group of genes (or proteins) that interact and carry out a common function in the cell. Most studies identify genes associated with a phenotype on the basis of some statistical bias, others have extended these statistical methods to analyze functional modules and biological pathways for phenotype-relatedness. However, a biologist might often have a specific question in mind while performing such analysis and most of the resulting subsystems obtained by the existing methods might be largely irrelevant to the question in hand. Arguably, it would be valuable to incorporate biologist's knowledge about the phenotype into the algorithm. 
This way, it is anticipated that the resulting subsystems would not only be related to the target phenotype but also contain information that the biologist is likely to be interested in. Results In this paper we introduce a fast and theoretically guaranteed method called DENSE (Dense and ENriched Subgraph Enumeration) that can take in as input a biologist's prior knowledge as a set of query proteins and identify all the dense functional modules in a biological network that contain some part of the query vertices. The density (in terms of the number of network edges) and the enrichment (the number of query proteins in the resulting functional module) can be manipulated via two parameters γ and μ, respectively. Conclusion This algorithm has been applied to the protein functional association network of Clostridium acetobutylicum ATCC 824, a hydrogen producing, acid-tolerant organism. The algorithm was able to verify relationships known to exist in literature and also some previously unknown relationships including those with regulatory and signaling functions. Additionally, we were also able to hypothesize that some uncharacterized proteins are likely associated with the target phenotype. The DENSE code can be downloaded from http://www.freescience.org/cs/DENSE/}, journal={BMC SYSTEMS BIOLOGY}, author={Hendrix, William and Rocha, Andrea M. and Padmanabhan, Kanchana and Choudhary, Alok and Scott, Kathleen and Mihelcic, James R. and Samatova, Nagiza F.}, year={2011}, month={Oct} } @article{schmidt_rocha_padmanabhan_chen_scott_mihelcic_samatova_2011, title={Efficient alpha, beta-motif finder for identification of phenotype-related functional modules}, volume={12}, ISSN={["1471-2105"]}, DOI={10.1186/1471-2105-12-440}, abstractNote={Abstract Background Microbial communities in their natural environments exhibit phenotypes that can directly cause particular diseases, convert biomass or wastewater to energy, or degrade various environmental contaminants. 
Understanding how these communities realize specific phenotypic traits (e.g., carbon fixation, hydrogen production) is critical for addressing health, bioremediation, or bioenergy problems. Results In this paper, we describe a graph-theoretical method for in silico prediction of the cellular subsystems that are related to the expression of a target phenotype. The proposed (α, β)-motif finder approach allows for identification of these phenotype-related subsystems that, in addition to metabolic subsystems, could include their regulators, sensors, transporters, and even uncharacterized proteins. By comparing dozens of genome-scale networks of functionally associated proteins, our method efficiently identifies those statistically significant functional modules that are in at least α networks of phenotype-expressing organisms but appear in no more than β networks of organisms that do not exhibit the target phenotype. It has been shown via various experiments that the enumerated modules are indeed related to phenotype-expression when tested with different target phenotypes like hydrogen production, motility, aerobic respiration, and acid-tolerance. Conclusion Thus, we have proposed a methodology that can identify potential statistically significant phenotype-related functional modules. The functional module is modeled as an (α, β)-clique, where α and β are two criteria introduced in this work. We also propose a novel network model, called the two-typed, divided network. The new network model and the criteria make the problem tractable even while very large networks are being compared. The code can be downloaded from http://www.freescience.org/cs/ABClique/ }, journal={BMC BIOINFORMATICS}, author={Schmidt, Matthew C. and Rocha, Andrea M. and Padmanabhan, Kanchana and Chen, Zhengzhang and Scott, Kathleen and Mihelcic, James R. 
and Samatova, Nagiza F.}, year={2011}, month={Nov} } @inbook{jenkins_arkatkar_owens_choudhary_samatova_2011, title={Lessons Learned from Exploring the Backtracking Paradigm on the GPU}, ISBN={9783642233968 9783642233975}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-23397-5_42}, DOI={10.1007/978-3-642-23397-5_42}, abstractNote={We explore the backtracking paradigm with properties seen as sub-optimal for GPU architectures, using as a case study the maximal clique enumeration problem, and find that the presence of these properties limit GPU performance to approximately 1.4–2.25 times a single CPU core. The GPU performance "lessons" we find critical to providing this performance include a coarse-and-fine-grain parallelization of the search space, a low-overhead load-balanced distribution of work, global memory latency hiding through coalescence, saturation, and shared memory utilization, and the use of GPU output buffering as a solution to irregular workloads and a large solution domain. We also find a strong reliance on an efficient global problem structure representation that bounds any efficiencies gained from these lessons, and discuss the meanings of these results to backtracking problems in general.}, booktitle={Euro-Par 2011 Parallel Processing}, publisher={Springer Berlin Heidelberg}, author={Jenkins, John and Arkatkar, Isha and Owens, John D. 
and Choudhary, Alok and Samatova, Nagiza F.}, year={2011}, pages={425–437} } @article{belnap_pan_denef_samatova_hettich_banfield_2011, title={Quantitative proteomic analyses of the response of acidophilic microbial communities to different pH conditions}, volume={5}, ISSN={1751-7362 1751-7370}, url={http://dx.doi.org/10.1038/ismej.2010.200}, DOI={10.1038/ismej.2010.200}, abstractNote={Abstract Extensive genomic characterization of multi-species acid mine drainage microbial consortia combined with laboratory cultivation has enabled the application of quantitative proteomic analyses at the community level. In this study, quantitative proteomic comparisons were used to functionally characterize laboratory-cultivated acidophilic communities sustained in pH 1.45 or 0.85 conditions. The distributions of all proteins identified for individual organisms indicated biases for either high or low pH, and suggests pH-specific niche partitioning for low abundance bacteria and archaea. Although the proteome of the dominant bacterium, Leptospirillum group II, was largely unaffected by pH treatments, analysis of functional categories indicated proteins involved in amino acid and nucleotide metabolism, as well as cell membrane/envelope biogenesis were overrepresented at high pH. Comparison of specific protein abundances indicates higher pH conditions favor Leptospirillum group III, whereas low pH conditions promote the growth of certain archaea. Thus, quantitative proteomic comparisons revealed distinct differences in community composition and metabolic function of individual organisms during different pH treatments. Proteomic analysis revealed other aspects of community function. Different numbers of phage proteins were identified across biological replicates, indicating stochastic spatial heterogeneity of phage outbreaks. 
Additionally, proteomic data were used to identify a previously unknown genotypic variant of Leptospirillum group II, an indication of selection for a specific Leptospirillum group II population in laboratory communities. Our results confirm the importance of pH and related geochemical factors in fine-tuning acidophilic microbial community structure and function at the species and strain level, and demonstrate the broad utility of proteomics in laboratory community studies.}, number={7}, journal={The ISME Journal}, publisher={Springer Science and Business Media LLC}, author={Belnap, Christopher P and Pan, Chongle and Denef, Vincent J and Samatova, Nagiza F and Hettich, Robert L and Banfield, Jillian F}, year={2011}, month={Jan}, pages={1152–1161} } @article{li_ma_yoginath_kora_samatova_2011, title={Transparent runtime parallelization of the R scripting language}, volume={71}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2010.08.013}, abstractNote={Scripting languages such as R and Matlab are widely used in scientific data processing. As the data volume and the complexity of analysis tasks both grow, sequential data processing using these tools often becomes the bottleneck in scientific workflows. We describe pR, a runtime framework for automatic and transparent parallelization of the popular R language used in statistical computing. Recognizing scripting languages’ interpreted nature and data analysis codes’ use pattern, we propose several novel techniques: (1) applying parallelizing compiler technology to runtime, whole-program dependence analysis of scripting languages, (2) incremental code analysis assisted with evaluation results, and (3) runtime parallelization of file accesses. Our framework does not require any modification to either the source code or the underlying R implementation. 
Experimental results demonstrate that pR can exploit both task and data parallelism transparently and overall has better performance as well as scalability compared to an existing parallel R package that requires code modification.}, number={2}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Li, Jiangtian and Ma, Xiaosong and Yoginath, Srikanth and Kora, Guruprasad and Samatova, Nagiza F.}, year={2011}, month={Feb}, pages={157–168} } @article{pan_park_mcdonald_carey_banfield_verberkmoes_hettich_samatova_2010, title={A high-throughput de novo sequencing approach for shotgun proteomics using high-resolution tandem mass spectrometry}, volume={11}, journal={BMC Bioinformatics}, author={Pan, C. and Park, B. H. and McDonald, W. H. and Carey, P. A. and Banfield, J. F. and VerBerkmoes, N. C. and Hettich, R. L. and Samatova, N. F.}, year={2010} } @article{lin_ma_feng_samatova_2011, title={Coordinating Computation and I/O in Massively Parallel Sequence Search}, volume={22}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2010.101}, abstractNote={With the explosive growth of genomic information, the searching of sequence databases has emerged as one of the most computation and data-intensive scientific applications. Our previous studies suggested that parallel genomic sequence-search possesses highly irregular computation and I/O patterns. Effectively addressing these runtime irregularities is thus the key to designing scalable sequence-search tools on massively parallel computers. While the computation scheduling for irregular scientific applications and the optimization of noncontiguous file accesses have been well-studied independently, little attention has been paid to the interplay between the two. In this paper, we systematically investigate the computation and I/O scheduling for data-intensive, irregular scientific applications within the context of genomic sequence search. 
Our study reveals that the lack of coordination between computation scheduling and I/O optimization could result in severe performance issues. We then propose an integrated scheduling approach that effectively improves sequence-search throughput by gracefully coordinating the dynamic load balancing of computation and high-performance noncontiguous I/O.}, number={4}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Lin, Heshan and Ma, Xiaosong and Feng, Wuchun and Samatova, Nagiza F.}, year={2011}, month={Apr}, pages={529–543} } @inproceedings{chen_wilson_jin_samatova_2010, title={Detecting and Tracking Community Dynamics in Evolutionary Networks}, DOI={10.1109/icdmw.2010.32}, abstractNote={Community structure or clustering is ubiquitous in many evolutionary networks including social networks, biological networks and financial market networks. Detecting and tracking community deviations in evolutionary networks can uncover important and interesting behaviors that are latent if we ignore the dynamic information. In biological networks, for example, a small variation in a gene community may indicate an event, such as gene fusion, gene fission, or gene decay. In contrast to the previous work on detecting communities in static graphs or tracking conserved communities in time-varying graphs, this paper first introduces the concept of community dynamics, and then shows that the baseline approach by enumerating all communities in each graph and comparing all pairs of communities between consecutive graphs is infeasible and impractical. We propose an efficient method for detecting and tracking community dynamics in evolutionary networks by introducing graph representatives and community representatives to avoid generating redundant communities and limit the search space. 
We measure the performance of the representative-based algorithm by comparison to the baseline algorithm on synthetic networks, and our experiments show that our algorithm achieves a runtime speedup of 11–46. The method has also been applied to two real-world evolutionary networks including Food Web and Enron Email. Significant and informative community dynamics have been detected in both cases.}, booktitle={The 10th IEEE International Conference on Data Mining Workshops}, publisher={Los Alamitos, Calif. : IEEE Computer Society}, author={Chen, Z and Wilson, K.A. and Jin, Y. and Samatova, N.F.}, year={2010}, pages={318–327} } @article{hendrix_schmidt_breimyer_samatova_2010, title={Theoretical underpinnings for maximal clique enumeration on perturbed graphs}, volume={411}, ISSN={["1879-2294"]}, DOI={10.1016/j.tcs.2010.03.011}, abstractNote={The problem of enumerating the maximal cliques of a graph is a computationally expensive problem with applications in a number of different domains. Sometimes the benefit of knowing the maximal clique enumeration (MCE) of a single graph is worth investing the initial computation time. However, when graphs are abstractions of noisy or uncertain data, the MCE of several closely related graphs may need to be found, and the computational cost of doing so becomes prohibitively expensive. Here, we present a method by which the cost of enumerating the set of maximal cliques for related graphs can be reduced. By using the MCE for some baseline graph, the MCE for a modified, or perturbed, graph may be obtained by enumerating only the maximal cliques that are created or destroyed by the perturbation. When the baseline and perturbed graphs are relatively similar, the difference set between the two MCEs can be overshadowed by the maximal cliques common to both. 
Thus, by enumerating only the difference set between the baseline and perturbed graphs’ MCEs, the computational cost of enumerating the maximal cliques of the perturbed graph can be reduced. We present necessary and sufficient conditions for enumerating difference sets when the perturbed graph is formed by several different types of perturbations. We also present results of an algorithm based on these conditions that demonstrate a speedup over traditional calculations of the MCE of perturbed, real biological networks.}, number={26-28}, journal={THEORETICAL COMPUTER SCIENCE}, author={Hendrix, William and Schmidt, Matthew C. and Breimyer, Paul and Samatova, Nagiza F.}, year={2010}, month={Jun}, pages={2520–2536} } @article{schmidt_samatova_thomas_park_2009, title={A scalable, parallel algorithm for maximal clique enumeration}, volume={69}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2009.01.003}, abstractNote={The problem of maximal clique enumeration (MCE) is to enumerate all of the maximal cliques in a graph. Once enumerated, maximal cliques are widely used to solve problems in areas such as 3-D protein structure alignment, genome mapping, gene expression analysis, and detection of social hierarchies. Even the most efficient serial MCE algorithms require large amounts of time to enumerate the maximal cliques in networks arising from these problems that contain hundreds, thousands, or larger numbers of vertices. The previous attempts to provide practical solutions to the MCE problem through parallel implementation have had limited success, largely due to a number of challenges inherent to the nature of the MCE combinatorial search space. On the one hand, MCE algorithms often create a backtracking search tree that has a highly irregular and hard-or-impossible to predict structure; therefore, almost any static decomposition of the search tree by parallel processors results in highly unbalanced processor execution times. 
On the other hand, the data-intensive nature of the MCE problem often makes naive dynamic load distribution strategies that require extensive data movement prohibitively expensive. As a result, good scaling of the overall execution time of parallel MCE algorithms has been reported for only up to a couple hundred processors. In this paper, we propose a parallel, scalable, and memory-efficient MCE algorithm for distributed and/or shared memory high performance computing architectures, whose runtime scales linearly for thousands of processors on real-world application graphs with hundreds and thousands of nodes. Its scalability and efficiency are attributed to the proposed: (a) representation of the search tree decomposition to enable parallelization; (b) parallel depth-first backtracking search to both constrain the search space and minimize memory requirement; (c) least stringent synchronization to minimize data movement; and (d) on-demand work stealing intelligently coupled with work stack splitting to minimize computing elements’ idle time. To the best of our knowledge, the proposed parallel MCE algorithm is the first to achieve a linear scaling runtime using up to 2048 processors on Cray XT machines for a number of real-world biological networks.}, number={4}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Schmidt, Matthew C. and Samatova, Nagiza F. and Thomas, Kevin and Park, Byung-Hoon}, year={2009}, month={Apr}, pages={417–428} } @inproceedings{breimyer_green_kumar_samatova_2009, title={BioDEAL: community generation of biological annotations}, volume={9}, booktitle={BMC Medical Informatics and Decision Making}, author={Breimyer, P. and Green, N. and Kumar, V. and Samatova, N. 
F.}, year={2009} } @article{chang_ku_diamond_lin_parker_hahm_samatova_2009, title={Compressed ion temperature gradient turbulence in diverted tokamak edge}, volume={16}, ISSN={["1089-7674"]}, DOI={10.1063/1.3099329}, abstractNote={It is found from a heat-flux-driven full-f gyrokinetic particle simulation that there is ion temperature gradient (ITG) turbulence across an entire L-mode-like edge density pedestal in a diverted tokamak plasma in which the ion temperature gradient is mild without a pedestal structure, hence the normalized ion temperature gradient parameter ηi=(d log Ti/dr)/(d log n/dr) varies strongly from high (>4 at density pedestal top/shoulder) to low (<2 in the density slope) values. Variation of density and ηi is in the same scale as the turbulence correlation length, compressing the turbulence in the density slope region. The resulting ion thermal flux is on the order of experimentally inferred values. The present study strongly suggests that a localized estimate of the ITG-driven χi will not be valid due to the nonlocal dynamics of the compressed turbulence in an L-mode-type density slope. While the thermal transport and the temperature profile saturate quickly, the E×B rotation shows a longer time damping during the turbulence. In addition, a radially in-out mean potential variation is observed.}, number={5}, journal={PHYSICS OF PLASMAS}, author={Chang, C. S. and Ku, S. and Diamond, P. H. and Lin, Z. and Parker, S. and Hahm, T. S. 
and Samatova, N.}, year={2009}, month={May} } @article{belnap_pan_verberkmoes_power_samatova_carver_hettich_banfield_2009, title={Cultivation and quantitative proteomic analyses of acidophilic microbial communities}, volume={4}, ISSN={1751-7362 1751-7370}, url={http://dx.doi.org/10.1038/ismej.2009.139}, DOI={10.1038/ismej.2009.139}, abstractNote={Abstract Acid mine drainage (AMD), an extreme environment characterized by low pH and high metal concentrations, can support dense acidophilic microbial biofilm communities that rely on chemoautotrophic production based on iron oxidation. Field determined production rates indicate that, despite the extreme conditions, these communities are sufficiently well adapted to their habitats to achieve primary production rates comparable to those of microbial communities occurring in some non-extreme environments. To enable laboratory studies of growth, production and ecology of AMD microbial communities, a culturing system was designed to reproduce natural biofilms, including organisms recalcitrant to cultivation. A comprehensive metabolic labeling-based quantitative proteomic analysis was used to verify that natural and laboratory communities were comparable at the functional level. Results confirmed that the composition and core metabolic activities of laboratory-grown communities were similar to a natural community, including the presence of active, low abundance bacteria and archaea that have not yet been isolated. However, laboratory growth rates were slow compared with natural communities, and this correlated with increased abundance of stress response proteins for the dominant bacteria in laboratory communities. Modification of cultivation conditions reduced the abundance of stress response proteins and increased laboratory community growth rates. 
The research presented here represents the first description of the application of a metabolic labeling-based quantitative proteomic analysis at the community level and resulted in a model microbial community system ideal for testing physiological and ecological hypotheses.}, number={4}, journal={The ISME Journal}, publisher={Springer Science and Business Media LLC}, author={Belnap, Christopher P and Pan, Chongle and VerBerkmoes, Nathan C and Power, Mary E and Samatova, Nagiza F and Carver, Rudolf L and Hettich, Robert L and Banfield, Jillian F}, year={2009}, month={Dec}, pages={520–530} } @inproceedings{awekar_samatova_2009, title={Fast matching for all pairs similarity search}, DOI={10.1109/wi-iat.2009.52}, abstractNote={All pairs similarity search is the problem of finding all pairs of records that have a similarity score above the specified threshold. Many real-world systems like search engines, online social networks, and digital libraries frequently have to solve this problem for datasets having millions of records in a high dimensional space, which are often sparse. The challenge is to design algorithms with feasible time requirements. To meet this challenge, algorithms have been proposed based on the inverted index, which maps each dimension to a list of records with non-zero projection along that dimension. Common to these algorithms is a three-phase framework of data preprocessing, pairs matching, and indexing. Matching is the most time-consuming phase. Within this framework, we propose fast matching technique that uses the sparse nature of real-world data to effectively reduce the size of the search space through a systematic set of tighter filtering conditions and heuristic optimizations. 
We integrate our technique with the fastest-to-date algorithm in the field and achieve up to 6.5X speed-up on three large real-world datasets.}, booktitle={2009 IEEE/WIC/ACM International Joint Conferences on Web Intelligence (WI) and Intelligent Agent Technologies (IAT), vol 1}, author={Awekar, A. and Samatova, N. F.}, year={2009}, pages={295–300} } @article{yang_pappas_hauser_land_chen_hurst_pan_kouvelis_typas_pelletier_et al._2009, title={Improved genome annotation for Zymomonas mobilis}, volume={27}, ISSN={1087-0156 1546-1696}, url={http://dx.doi.org/10.1038/nbt1009-893}, DOI={10.1038/nbt1009-893}, number={10}, journal={Nature Biotechnology}, publisher={Springer Science and Business Media LLC}, author={Yang, Shihui and Pappas, Katherine M and Hauser, Loren J and Land, Miriam L and Chen, Gwo-Liang and Hurst, Gregory B and Pan, Chongle and Kouvelis, Vassili N and Typas, Milton A and Pelletier, Dale A and et al.}, year={2009}, month={Oct}, pages={893–894} } @article{chen_schmidt_samatova_2009, title={On parameterized complexity of the Multi-MCS problem}, volume={410}, ISSN={["1879-2294"]}, DOI={10.1016/j.tcs.2008.12.060}, abstractNote={We introduce the maximum common subgraph problem for multiple graphs (Multi-MCS) inspired by various biological applications such as multiple alignments of gene sequences, protein structures, metabolic pathways, or protein–protein interaction networks. Multi-MCS is a generalization of the two-graph Maximum Common Subgraph problem (MCS). On the basis of the framework of parameterized complexity theory, we derive the parameterized complexity of Multi-MCS for various parameters for different classes of graphs. For example, for directed graphs with labeled vertices, we prove that the parameterized m-Multi-MCS problem is W[2]-hard, while the parameterized k-Multi-MCS problem is W[t]-hard (∀t≥1), where m and k are the size of the maximum common subgraph and the number of multiple graphs, respectively. 
We show similar results for other parameterized versions of the Multi-MCS problem for directed graphs with vertex labels and undirected graphs with vertex and edge labels by giving linear FPT reductions of the problems from parameterized versions of the longest common subsequence problem. Likewise, for unlabeled undirected graphs, we show that a parameterized version of the Multi-MCS problem with a fixed number of input graphs is W[1]-complete by showing a linear FPT reduction to and from a parameterized version of the maximum clique problem.}, number={21-23}, journal={THEORETICAL COMPUTER SCIENCE}, author={Chen, Wenbin and Schmidt, Matthew C. and Samatova, Nagiza F.}, year={2009}, month={May}, pages={2024–2032} } @article{chen_schmidt_samatova_2011, title={On the parameterized complexity of the Multi-MCT and Multi-MCST problems}, volume={21}, ISSN={["1573-2886"]}, DOI={10.1007/s10878-009-9220-2}, number={2}, journal={JOURNAL OF COMBINATORIAL OPTIMIZATION}, author={Chen, Wenbin and Schmidt, Matthew C. and Samatova, Nagiza F.}, year={2011}, month={Feb}, pages={151–158} } @inbook{lin_ma_li_yu_samatova_2008, title={Adaptive Request Scheduling for Parallel Scientific Web Services}, ISBN={9783540694762 9783540694977}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-540-69497-7_19}, DOI={10.1007/978-3-540-69497-7_19}, abstractNote={Scientific web services often possess data models and query workloads quite different from commercial ones and are much less studied. Individual queries have to be processed in parallel by multiple server nodes, due to the computation- and data-intensiveness of the processing. Meanwhile, each query is performed against portions of a large, common dataset. Existing scheduling policies from traditional environments (namely cluster web servers and supercomputers) consider only the data or the computation aspect alone and are therefore inadequate for this new type of workload. 
In this paper, we systematically investigate adaptive scheduling for scientific web services, by taking into account parallel computation scalability, data locality, and load balancing. Our case study focuses on high-throughput query processing on biological sequence databases, a fundamental task performed daily by millions of scientists, who increasingly prefer to use web services powered by parallel servers. Our research indicates that intelligent resource allocation and scheduling are crucial in improving the overall performance of a parallel sequence database search server. Failure to consider either the parallel computation scalability or the data locality issues can significantly hurt the system throughput and query response time. Also, no single static strategy works best for all request workloads or all resources settings. In response, we present several dynamic scheduling techniques that automatically adapt to the request workload and system configuration in making scheduling decisions. 
Experiments on a cluster using 32 processors show the combination of these techniques delivers a several-fold improvement in average query response time across various workloads.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Lin, Heshan and Ma, Xiaosong and Li, Jiangtian and Yu, Ting and Samatova, Nagiza}, year={2008}, month={Aug}, pages={276–294} } @article{pan_oda_lankford_zhang_samatova_pelletier_harwood_hettich_2008, title={Characterization of anaerobic catabolism of p-coumarate in Rhodopseudomonas palustris by integrating transcriptomics and quantitative proteomics}, volume={7}, ISSN={["1535-9484"]}, DOI={10.1074/mcp.M700147-MCP200}, abstractNote={In this study, the pathway for anaerobic catabolism of p-coumarate by a model bacterium, Rhodopseudomonas palustris, was characterized by comparing the gene expression profiles of cultures grown in the presence of p-coumarate, benzoate, or succinate as the sole carbon sources. Gene expression was quantified at the mRNA level with transcriptomics and at the protein level with quantitative proteomics using 15N metabolic labeling. Protein relative abundances, along with their confidence intervals for statistical significance evaluation, were estimated with the software ProRata. Both -omics measurements were used as the transcriptomics provided near-full genome coverage of gene expression profiles and the quantitative proteomics ascertained abundance changes of over 1600 proteins. The integrated gene expression data are consistent with the hypothesis that p-coumarate is converted to benzoyl-CoA, which is then degraded via a known aromatic ring reduction pathway. For the metabolism of p-coumarate to benzoyl-CoA, two alternative routes, a β-oxidation route and a non-β-oxidation route, are possible. The integrated gene expression data provided strong support for the non-β-oxidation route in R. palustris. 
A putative gene was proposed for every step in the non-β-oxidation route.}, number={5}, journal={MOLECULAR & CELLULAR PROTEOMICS}, author={Pan, Chongle and Oda, Yasuhiro and Lankford, Patricia K. and Zhang, Bing and Samatova, Nagiza F. and Pelletier, Dale A. and Harwood, Caroline S. and Hettich, Robert L.}, year={2008}, month={May}, pages={938–948} } @article{zhang_park_karpinets_samatova_2008, title={From pull-down data to protein interaction networks and complexes with biological relevance}, volume={24}, ISSN={["1460-2059"]}, DOI={10.1093/bioinformatics/btn036}, abstractNote={Abstract Motivation:  Recent improvements in high-throughput Mass Spectrometry (MS) technology have expedited genome-wide discovery of protein–protein interactions by providing a capability of detecting protein complexes in a physiological setting. Computational inference of protein interaction networks and protein complexes from MS data are challenging. Advances are required in developing robust and seamlessly integrated procedures for assessment of protein–protein interaction affinities, mathematical representation of protein interaction networks, discovery of protein complexes and evaluation of their biological relevance. Results: A multi-step but easy-to-follow framework for identifying protein complexes from MS pull-down data is introduced. It assesses interaction affinity between two proteins based on similarity of their co-purification patterns derived from MS data. It constructs a protein interaction network by adopting a knowledge-guided threshold selection method. Based on the network, it identifies protein complexes and infers their core components using a graph-theoretical approach. It deploys a statistical evaluation procedure to assess biological relevance of each found complex. 
On Saccharomyces cerevisiae pull-down data, the framework outperformed other more complicated schemes by at least 10% in F1-measure and identified 610 protein complexes with high-functional homogeneity based on the enrichment in Gene Ontology (GO) annotation. Manual examination of the complexes brought forward the hypotheses on cause of false identifications. Namely, co-purification of different protein complexes as mediated by a common non-protein molecule, such as DNA, might be a source of false positives. Protein identification bias in pull-down technology, such as the hydrophilic bias could result in false negatives. Contact:  samatovan@ornl.gov Supplementary information: Supplementary data are available at Bioinformatics online.}, number={7}, journal={BIOINFORMATICS}, author={Zhang, Bing and Park, Byung-Hoon and Karpinets, Tatiana and Samatova, Nagiza F.}, year={2008}, month={Apr}, pages={979–986} } @inbook{abu-khzam_rizk_abdallah_samatova_2008, title={The Buffered Work-Pool Approach for Search-Tree Based Optimization Algorithms}, ISBN={9783540681052 9783540681113}, url={http://dx.doi.org/10.1007/978-3-540-68111-3_19}, DOI={10.1007/978-3-540-68111-3_19}, booktitle={Parallel Processing and Applied Mathematics}, publisher={Springer Berlin Heidelberg}, author={Abu-Khzam, Faisal N. and Rizk, Mohamad A. and Abdallah, Deema A. and Samatova, Nagiza F.}, year={2008}, month={May}, pages={170–179} } @article{park_ostrouchov_samatova_2007, title={Sampling streaming data with replacement}, volume={52}, ISSN={0167-9473}, url={http://dx.doi.org/10.1016/j.csda.2007.03.010}, DOI={10.1016/j.csda.2007.03.010}, abstractNote={Simple random sampling is a widely accepted basis for estimation from a population. When data come as a stream, the total population size continuously grows and only one pass through the data is possible. Reservoir sampling is a method of maintaining a fixed size random sample from streaming data. 
Reservoir sampling without replacement has been extensively studied and several algorithms with sub-linear time complexity exist. Although reservoir sampling with replacement is previously mentioned by some authors, it has been studied very little and only linear algorithms exist. A with-replacement reservoir sampling algorithm of sub-linear time complexity is introduced. A thorough complexity analysis of several approaches to the with-replacement reservoir sampling problem is also provided.}, number={2}, journal={Computational Statistics & Data Analysis}, publisher={Elsevier BV}, author={Park, Byung-Hoon and Ostrouchov, George and Samatova, Nagiza F.}, year={2007}, month={Oct}, pages={750–762} } @article{zhang_verberkmoes_langston_uberbacher_hettich_samatova_2006, title={Detecting Differential and Correlated Protein Expression in Label-Free Shotgun Proteomics}, volume={5}, ISSN={1535-3893 1535-3907}, url={http://dx.doi.org/10.1021/pr0600273}, DOI={10.1021/pr0600273}, abstractNote={Recent studies have revealed a relationship between protein abundance and sampling statistics, such as sequence coverage, peptide count, and spectral count, in label-free liquid chromatography-tandem mass spectrometry (LC-MS/MS) shotgun proteomics. The use of sampling statistics offers a promising method of measuring relative protein abundance and detecting differentially expressed or coexpressed proteins. We performed a systematic analysis of various approaches to quantifying differential protein expression in eukaryotic Saccharomyces cerevisiae and prokaryotic Rhodopseudomonas palustris label-free LC-MS/MS data. First, we showed that, among three sampling statistics, the spectral count has the highest technical reproducibility, followed by the less-reproducible peptide count and relatively nonreproducible sequence coverage. 
Second, we used spectral count statistics to measure differential protein expression in pairwise experiments using five statistical tests: Fisher's exact test, G-test, AC test, t-test, and LPE test. Given the S. cerevisiae data set with spiked proteins as a benchmark and the false positive rate as a metric, our evaluation suggested that the Fisher's exact test, G-test, and AC test can be used when the number of replications is limited (one or two), whereas the t-test is useful with three or more replicates available. Third, we generalized the G-test to increase the sensitivity of detecting differential protein expression under multiple experimental conditions. Out of 1622 identified R. palustris proteins in the LC-MS/MS experiment, the generalized G-test detected 1119 differentially expressed proteins under six growth conditions. Finally, we studied correlated expression of these 1119 proteins by analyzing pairwise expression correlations and by delineating protein clusters according to expression patterns. Through pairwise expression correlation analysis, we demonstrated that proteins co-located in the same operon were much more strongly coexpressed than those from different operons. Combining cluster analysis with existing protein functional annotations, we identified six protein clusters with known biological significance. In summary, the proposed generalized G-test using spectral count sampling statistics is a viable methodology for robust quantification of relative protein abundance and for sensitive detection of biologically significant differential protein expression under multiple experimental conditions in label-free shotgun proteomics.}, number={11}, journal={Journal of Proteome Research}, publisher={American Chemical Society (ACS)}, author={Zhang, Bing and VerBerkmoes, Nathan C. and Langston, Michael A. and Uberbacher, Edward and Hettich, Robert L. 
and Samatova, Nagiza F.}, year={2006}, month={Nov}, pages={2909–2918} } @article{austin_allen_mccollum_dar_wilgus_sayler_samatova_cox_simpson_2006, title={Gene network shaping of inherent noise spectra}, volume={439}, ISSN={0028-0836 1476-4687}, url={http://dx.doi.org/10.1038/nature04194}, DOI={10.1038/nature04194}, abstractNote={Recent work demonstrates that stochastic fluctuations in molecular populations have consequences for gene regulation. Previous experiments focused on noise sources or noise propagation through gene networks by measuring noise magnitudes. However, in theoretical analysis, we showed that noise frequency content is determined by the underlying gene circuits, leading to a mapping between gene circuit structure and the noise frequency range. An intriguing prediction from our previous studies was that negative autoregulation shifts noise to higher frequencies where it is more easily filtered out by gene networks--a property that may contribute to the prevalence of autoregulation motifs (for example, found in the regulation of approximately 40% of Escherichia coli genes). Here we measure noise frequency content in growing cultures of E. coli, and verify the link between gene circuit structure and noise spectra by demonstrating the negative autoregulation-mediated spectral shift. We further demonstrate that noise spectral measurements provide mechanistic insights into gene regulation, as perturbations of gene circuit parameters are discernible in the measured noise frequency ranges. These results suggest that noise spectral measurements could facilitate the discovery of novel regulatory relationships.}, number={7076}, journal={Nature}, publisher={Springer Science and Business Media LLC}, author={Austin, D. W. and Allen, M. S. and McCollum, J. M. and Dar, R. D. and Wilgus, J. R. and Sayler, G. S. and Samatova, N. F. and Cox, C. D. and Simpson, M. 
L.}, year={2006}, month={Feb}, pages={608–611} } @article{pan_kora_mcdonald_tabb_verberkmoes_hurst_pelletier_samatova_hettich_2006, title={ProRata:  A Quantitative Proteomics Program for Accurate Protein Abundance Ratio Estimation with Confidence Interval Evaluation}, volume={78}, ISSN={0003-2700 1520-6882}, url={http://dx.doi.org/10.1021/ac060654b}, DOI={10.1021/ac060654b}, abstractNote={A profile likelihood algorithm is proposed for quantitative shotgun proteomics to infer the abundance ratios of proteins from the abundance ratios of isotopically labeled peptides derived from proteolysis. Previously, we have shown that the estimation variability and bias of peptide abundance ratios can be predicted from their profile signal-to-noise ratios. Given multiple quantified peptides for a protein, the profile likelihood algorithm probabilistically weighs the peptide abundance ratios by their inferred estimation variability, accounts for their expected estimation bias, and suppresses contribution from outliers. This algorithm yields maximum likelihood point estimation and profile likelihood confidence interval estimation of protein abundance ratios. This point estimator is more accurate than an estimator based on the average of peptide abundance ratios. The confidence interval estimation provides an "error bar" for each protein abundance ratio that reflects its estimation precision and statistical uncertainty. The accuracy of the point estimation and the precision and confidence level of the interval estimation were benchmarked with standard mixtures of isotopically labeled proteomes. The profile likelihood algorithm was integrated into a quantitative proteomics program, called ProRata, freely available at www.MSProRata.org.}, number={20}, journal={Analytical Chemistry}, publisher={American Chemical Society (ACS)}, author={Pan, Chongle and Kora, Guruprasad and McDonald, W. Hayes and Tabb, David L. and VerBerkmoes, Nathan C. and Hurst, Gregory B. and Pelletier, Dale A. 
and Samatova, Nagiza F. and Hettich, Robert L.}, year={2006}, month={Oct}, pages={7121–7131} } @article{pan_kora_tabb_pelletier_mcdonald_hurst_hettich_samatova_2006, title={Robust Estimation of Peptide Abundance Ratios and Rigorous Scoring of Their Variability and Bias in Quantitative Shotgun Proteomics}, volume={78}, ISSN={0003-2700 1520-6882}, url={http://dx.doi.org/10.1021/ac0606554}, DOI={10.1021/ac0606554}, abstractNote={The abundance ratio between the light and heavy isotopologues of an isotopically labeled peptide can be estimated from their selected ion chromatograms. However, quantitative shotgun proteomics measurements yield selected ion chromatograms at highly variable signal-to-noise ratios for tens of thousands of peptides. This challenge calls for algorithms that not only robustly estimate the abundance ratios of different peptides but also rigorously score each abundance ratio for the expected estimation bias and variability. Scoring of the abundance ratios, much like scoring of sequence assignment for tandem mass spectra by peptide identification algorithms, enables filtering of unreliable peptide quantification and use of formal statistical inference in the subsequent protein abundance ratio estimation. In this study, a parallel paired covariance algorithm is used for robust peak detection in selected ion chromatograms. A peak profile is generated for each peptide, which is a scatterplot of ion intensities measured for the two isotopologues within their chromatographic peaks. Principal component analysis of the peak profile is proposed to estimate the peptide abundance ratio and to score the estimation with the signal-to-noise ratio of the peak profile (profile signal-to-noise ratio). 
We demonstrate that the profile signal-to-noise ratio is inversely correlated with the variability and bias of peptide abundance ratio estimation.}, number={20}, journal={Analytical Chemistry}, publisher={American Chemical Society (ACS)}, author={Pan, Chongle and Kora, Guruprasad and Tabb, David L. and Pelletier, Dale A. and McDonald, W. Hayes and Hurst, Gregory B. and Hettich, Robert L. and Samatova, Nagiza F.}, year={2006}, month={Oct}, pages={7110–7120} } @article{mccollum_peterson_cox_simpson_samatova_2006, title={The sorting direct method for stochastic simulation of biochemical systems with varying reaction execution behavior}, volume={30}, ISSN={1476-9271}, url={http://dx.doi.org/10.1016/j.compbiolchem.2005.10.007}, DOI={10.1016/j.compbiolchem.2005.10.007}, abstractNote={A key to advancing the understanding of molecular biology in the post-genomic age is the development of accurate predictive models for genetic regulation, protein interaction, metabolism, and other biochemical processes. To facilitate model development, simulation algorithms must provide an accurate representation of the system, while performing the simulation in a reasonable amount of time. Gillespie's stochastic simulation algorithm (SSA) accurately depicts spatially homogeneous models with small populations of chemical species and properly represents noise, but it is often abandoned when modeling larger systems because of its computational complexity. In this work, we examine the performance of different versions of the SSA when applied to several biochemical models. Through our analysis, we discover that transient changes in reaction execution frequencies, which are typical of biochemical models with gene induction and repression, can dramatically affect simulator performance. To account for these shifts, we propose a new algorithm called the sorting direct method that maintains a loosely sorted order of the reactions as the simulation executes. 
Our measurements show that the sorting direct method performs favorably when compared to other well-known exact stochastic simulation algorithms.}, number={1}, journal={Computational Biology and Chemistry}, publisher={Elsevier BV}, author={McCollum, James M. and Peterson, Gregory D. and Cox, Chris D. and Simpson, Michael L. and Samatova, Nagiza F.}, year={2006}, month={Feb}, pages={39–49} } @inbook{suters_abu-khzam_zhang_symons_samatova_langston_2005, title={A New Approach and Faster Exact Methods for the Maximum Common Subgraph Problem}, ISBN={9783540280613 9783540318064}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/11533719_73}, DOI={10.1007/11533719_73}, abstractNote={The Maximum Common Subgraph (MCS) problem appears in many guises and in a wide variety of applications. The usual goal is to take as inputs two graphs, of order m and n, respectively, and find the largest induced subgraph contained in both of them. MCS is frequently solved by reduction to the problem of finding a maximum clique in the order mn association graph, which is a particular form of product graph built from the inputs. In this paper a new algorithm, termed "clique branching," is proposed that exploits a special structure inherent in the association graph. This structure contains a large number of naturally-ordered cliques that are present in the association graph's complement. A detailed analysis shows that the proposed algorithm requires O((m+1) n ) time, which is a superior worst-case bound to those known for previously-analyzed algorithms in the setting of the MCS problem.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Suters, W. Henry and Abu-Khzam, Faisal N. and Zhang, Yun and Symons, Christopher T. and Samatova, Nagiza F. 
and Langston, Michael A.}, year={2005}, pages={717–727} } @article{yu_park_chandramohan_munavalli_geist_samatova_2005, title={In silico Discovery of Enzyme–Substrate Specificity-determining Residue Clusters}, volume={352}, ISSN={0022-2836}, url={http://dx.doi.org/10.1016/j.jmb.2005.08.008}, DOI={10.1016/j.jmb.2005.08.008}, abstractNote={The binding between an enzyme and its substrate is highly specific, despite the fact that many different enzymes show significant sequence and structure similarity. There must be, then, substrate specificity-determining residues that enable different enzymes to recognize their unique substrates. We reason that a coordinated, not independent, action of both conserved and non-conserved residues determine enzymatic activity and specificity. Here, we present a surface patch ranking (SPR) method for in silico discovery of substrate specificity-determining residue clusters by exploring both sequence conservation and correlated mutations. As case studies we apply SPR to several highly homologous enzymatic protein pairs, such as guanylyl versus adenylyl cyclases, lactate versus malate dehydrogenases, and trypsin versus chymotrypsin. Without using experimental data, we predict several single and multi-residue clusters that are consistent with previous mutagenesis experimental results. Most single-residue clusters are directly involved in enzyme–substrate interactions, whereas multi-residue clusters are vital for domain–domain and regulator–enzyme interactions, indicating their complementary role in specificity determination. 
These results demonstrate that SPR may help the selection of target residues for mutagenesis experiments and, thus, focus rational drug design, protein engineering, and functional annotation to the relevant regions of a protein.}, number={5}, journal={Journal of Molecular Biology}, publisher={Elsevier BV}, author={Yu, Gong-Xin and Park, Byung-Hoon and Chandramohan, Praveen and Munavalli, Rajesh and Geist, Al and Samatova, Nagiza F.}, year={2005}, month={Oct}, pages={1105–1117} } @inproceedings{jenkins_arkatkar_lakshminarasimhan_boyuka_schendel_shah_ethier_chang_chen_kolla_et al., title={ALACRITY: Analytics-driven lossless data compression for rapid in-situ indexing, storing, and querying}, volume={8220}, booktitle={Transactions on large-scale data- and knowledge- centered systems x: special issue on database- and expert-systems applications}, author={Jenkins, J. and Arkatkar, I. and Lakshminarasimhan, S. and Boyuka, D. A. and Schendel, E. R. and Shah, N. and Ethier, S. and Chang, C. S. and Chen, J. and Kolla, H. and et al.}, pages={95–114} } @book{ranshous_shen_koutra_faloutsos_samatova, title={Anomaly detection in dynamic networks: A survey}, journal={Technical Report- Not held in TRLN member libraries}, author={Ranshous, S. and Shen, S. and Koutra, D. and Faloutsos, C. and Samatova, N. F.} } @article{padmanabhan_nudelman_harenberg_bello_sohn_shpanskaya_dikshit_yerramsetty_tanzi_saykin_et al., title={Characterizing gene and protein crosstalks in subjects at risk of developing Alzheimer's disease: A new computational approach}, volume={5}, number={3}, journal={Processes}, author={Padmanabhan, K. and Nudelman, K. and Harenberg, S. and Bello, G. and Sohn, D. and Shpanskaya, K. and Dikshit, P. T. and Yerramsetty, P. S. and Tanzi, R. E. and Saykin, A. J. 
and others} } @book{harenberg_bello_gjeltema_ranshous_harlalka_seay_padmanabhan_samatova, title={Community detection in large-scale networks: A Survey and empirical evaluation}, journal={Technical Report- Not held in TRLN member libraries}, author={Harenberg, S. and Bello, G. A. and Gjeltema, L. and Ranshous, S. and Harlalka, J. and Seay, R. and Padmanabhan, K. and Samatova, N.}, year={2014} } @book{lakshminarasimhan_shah_ethier_klasky_latham_ross_n.f., title={Compressing the Incompressible with ISABELA: In-situ Reduction of Spatio-Temporal Data}, journal={Technical Report- Not held in TRLN member libraries}, author={Lakshminarasimhan, S. and Shah, N. and Ethier, S. J. and Klasky, S. and Latham, R. and Ross, R. and Samatova, N. F.} } @inproceedings{samatova_schmidt_hendrix_breimyer_thomas_park, title={Coupling graph perturbation theory with scalable parallel algorithms for large-scale enumeration of maximal cliques in biological graphs - art. no. 012053}, volume={125}, booktitle={Scidac 2008: Scientific discovery through advanced computing}, author={Samatova, N. F. and Schmidt, M. C. and Hendrix, W. and Breimyer, P. and Thomas, K. and Park, B. H.}, pages={12053} } @book{chen_pansombut_hendrix_gonzalez_semazzi_choudhary_kumar_melechko_samatova, title={Forecaster: Forecast Oriented Feature Elimination-based Classification of Adverse Spatio-Temporal Extremes}, author={Chen, Z. and Pansombut, T. and Hendrix, W. and Gonzalez, D. and Semazzi, F. and Choudhary, A. and Kumar, V. and Melechko, A. V. and Samatova, N. F.} } @book{jenkins_zou_tang_kimpe_ross_samatova, title={Parallel data layout optimization of scientific data through access-driven replication}, journal={Technical Report- Not held in TRLN member libraries}, author={Jenkins, J. P. and Zou, X. and Tang, H. and Kimpe, D. and Ross, R. and Samatova, N. F.} }