% ScalaTrace journal article (JPDC 69(8), 2009). Citation key kept as exported so existing \cite commands still resolve.
@article{noeth_ratn_mueller_schulz_supinski_2009,
  author       = {Noeth, Michael and Ratn, Prasun and Mueller, Frank and Schulz, Martin and de Supinski, Bronis R.},
  title        = {{ScalaTrace}: Scalable compression and replay of communication traces for high-performance computing},
  journal      = {Journal of Parallel and Distributed Computing},
  volume       = {69},
  number       = {8},
  pages        = {696--710},
  year         = {2009},
  month        = aug,
  issn         = {1096-0848},
  doi          = {10.1016/j.jpdc.2008.09.001},
  abstractNote = {Characterizing the communication behavior of large-scale applications is a difficult and costly task due to code/system complexity and long execution times. While many tools to study this behavior have been developed, these approaches either aggregate information in a lossy way through high-level statistics or produce huge trace files that are hard to handle. We contribute an approach that provides orders of magnitude smaller, if not near-constant size, communication traces regardless of the number of nodes while preserving structural information. We introduce intra- and inter-node compression techniques of MPI events that are capable of extracting an application's communication structure. We further present a replay mechanism for the traces generated by our approach and discuss results of our implementation for BlueGene/L. Given this novel capability, we discuss its impact on communication tuning and beyond. To the best of our knowledge, such a concise representation of MPI traces in a scalable manner combined with deterministic MPI call replay is without any precedent.},
}