@article{bahmani_mueller_2017,
  title        = {Scalable Communication Event Tracing via Clustering},
  author       = {Bahmani, Amir and Mueller, Frank},
  journal      = {Journal of Parallel and Distributed Computing},
  volume       = {109},
  pages        = {230--244},
  year         = {2017},
  month        = nov,
  issn         = {1096-0848},
  doi          = {10.1016/j.jpdc.2017.06.008},
  abstractNote = {Communication traces help developers of high-performance computing (HPC) applications understand and improve their codes. When run on large-scale HPC facilities, the scalability of tracing tools becomes a challenge. To address this problem, traces can be clustered into groups of processes that exhibit similar behavior. Instead of collecting trace information of each individual node, it then suffices to collect a trace of a small set of representative nodes, namely one per cluster. However, clustering algorithms themselves need to have low overhead, be scalable, and adapt to application characteristics. We devised an adaptive clustering algorithm for large-scale applications called ACURDION that traces the MPI communication of code with O(log P) time complexity. First, ACURDION identifies the parameters that differ across processes by using a logarithmic algorithm called Adaptive Signature Building. Second, it clusters the processes based on those parameters. Experiments show that collecting traces of just nine nodes/clusters suffices to capture the communication behavior of all nodes for a wide set of HPC benchmarks codes while retaining sufficient accuracy of trace events and parameters. In summary, ACURDION improves trace scalability and automation over prior approaches.},
}

@article{bahmani_mueller_2016,
  title        = {Efficient Clustering for Ultra-Scale Application Tracing},
  author       = {Bahmani, Amir and Mueller, Frank},
  journal      = {Journal of Parallel and Distributed Computing},
  volume       = {98},
  pages        = {25--39},
  year         = {2016},
  month        = dec,
  issn         = {1096-0848},
  doi          = {10.1016/j.jpdc.2016.08.001},
  abstractNote = {Extreme-scale computing poses a number of challenges to application performance. Developers need to study application behavior by collecting detailed information with the help of tracing toolsets to determine shortcomings. But not only applications are ``scalability challenged'', current tracing toolsets also fall short of exascale requirements for low background overhead since trace collection for each execution entity is becoming infeasible. One effective solution is to cluster processes with the same behavior into groups. Instead of collecting performance information from each individual node, this information can be collected from just a set of representative nodes. This work contributes a fast, scalable, signature-based clustering algorithm that clusters processes exhibiting similar execution behavior. Instead of prior work based on statistical clustering, our approach produces precise results nearly without loss of events or accuracy. The proposed algorithm combines low overhead at the clustering level with log(P) time complexity, and it splits the merge process to make tracing suitable for extreme-scale computing. Overall, this multi-level precise clustering based on signatures further generalizes to a novel multi-metric clustering technique with unprecedented low overhead.},
}

@inproceedings{bahmani_sibley_parsian_owzar_mueller_2016,
  title        = {{SparkScore}: Leveraging {Apache Spark} for Distributed Genomic Inference},
  author       = {Bahmani, Amir and Sibley, Alexander B. and Parsian, Mahmoud and Owzar, Kouros and Mueller, Frank},
  booktitle    = {2016 {IEEE} 30th International Parallel and Distributed Processing Symposium Workshops ({IPDPSW})},
  pages        = {435--442},
  year         = {2016},
  issn         = {2164-7062},
  doi          = {10.1109/ipdpsw.2016.6},
  abstractNote = {The method of the efficient score statistic is used extensively to conduct inference for high throughput genomic data due to its computational efficiency and ability to accommodate simple and complex phenotypes. Inference based on these statistics can readily incorporate a priori knowledge from a vast collection of bioinformatics databases to further refine the analyses. The sampling distribution of the efficient score statistic is typically approximated using asymptotics. As this may be inappropriate in the context of small study size, or uncommon or rare variants, resampling methods are often used to approximate the exact sampling distribution. We propose SparkScore, a set of distributed computational algorithms implemented in Apache Spark, to leverage the embarrassingly parallel nature of genomic resampling inference on the basis of the efficient score statistics. We illustrate the application of this computational approach for the analysis of data from genome-wide analysis studies (GWAS). This computational approach also harnesses the fault-tolerant features of Spark and can be readily extended to analysis of DNA and RNA sequencing data, including expression quantitative trait loci (eQTL) and phenotype association studies.},
}