% ScalaTrace-family publications (Wu, Mueller, et al.), 2011--2012.
% Text outside @entries is ignored by BibTeX, so these notes are comments.
% Conventions: one field per line, lowercase field names, `--` page ranges,
% unquoted month macros, brace-protected acronyms/tool names in titles.
% `abstractNote` is a non-standard field kept from the original export;
% bibliography styles ignore it.

% ScalaExtrap in ACM TOPLAS 34(1). Same title and authors as wu_mueller_2011
% below, but a distinct publication (different venue, volume and DOI), so
% both keys are kept. No page/article number was present in the export.
@article{wu_mueller_2012,
  author       = {Wu, Xing and Mueller, Frank},
  title        = {{ScalaExtrap}: Trace-Based Communication Extrapolation for {SPMD} Programs},
  journal      = {{ACM} Transactions on Programming Languages and Systems},
  volume       = {34},
  number       = {1},
  year         = {2012},
  month        = apr,
  issn         = {1558-4593},
  doi          = {10.1145/2160910.2160914},
  abstractNote = {Performance modeling for scientific applications is important for assessing potential application performance and systems procurement in high-performance computing (HPC). Recent progress on communication tracing opens up novel opportunities for communication modeling due to its lossless yet scalable trace collection. Estimating the impact of scaling on communication efficiency still remains nontrivial due to execution-time variations and exposure to hardware and software artifacts.},
}

% ScalaBenchGen at IPDPS 2012. Typed as @inproceedings because the venue is
% a symposium proceedings; the original export had it as @article with the
% proceedings name in `journal`.
@inproceedings{wu_deshpande_mueller_2012,
  author       = {Wu, Xing and Deshpande, Vivek and Mueller, Frank},
  title        = {{ScalaBenchGen}: Auto-Generation of Communication Benchmarks Traces},
  booktitle    = {2012 {IEEE} 26th International Parallel and Distributed Processing Symposium ({IPDPS})},
  year         = {2012},
  pages        = {1250--1260},
  issn         = {1530-2075},
  doi          = {10.1109/ipdps.2012.114},
  abstractNote = {Benchmarks are essential for evaluating HPC hardware and software for petascale machines and beyond. But benchmark creation is a tedious manual process. As a result, benchmarks tend to lag behind the development of complex scientific codes. This work contributes an automated approach to the creation of communication benchmarks. Given an MPI application, we utilize ScalaTrace, a lossless and scalable framework to trace communication operations and execution time while abstracting away the computations. A single trace file that reflects the behavior of all nodes is subsequently expanded to C source code by a novel code generator. This resulting benchmark code is compact, portable, human-readable, and accurately reflects the original application's communication characteristics and runtime characteristics. Experimental results demonstrate that generated source code of benchmarks preserves both the communication patterns and the wall clock-time behavior of the original application. Such automatically generated benchmarks not only shorten the transition from application development to benchmark extraction but also facilitate code obfuscation, which is essential for benchmark extraction from commercial and restricted applications.},
}

% ScalaTrace overview paper. Author given names are initials as exported.
% NOTE(review): volume 7134 looks like a book-series number; if so, a
% `series` field should be added -- TODO confirm against the publisher record.
@inproceedings{mueller_wu_schulz_supinski_gamblin_2012,
  author    = {Mueller, F. and Wu, X. and Schulz, M. and Supinski, B. R. and Gamblin, T.},
  title     = {{ScalaTrace}: Tracing, Analysis and Modeling of {HPC} Codes at Scale},
  booktitle = {Applied Parallel and Scientific Computing, Part {II}},
  volume    = {7134},
  year      = {2012},
  pages     = {410--418},
}

% ScalaExtrap in ACM SIGPLAN Notices 46(8); see wu_mueller_2012 above for the
% later publication with the same title.
@article{wu_mueller_2011,
  author       = {Wu, Xing and Mueller, Frank},
  title        = {{ScalaExtrap}: Trace-Based Communication Extrapolation for {SPMD} Programs},
  journal      = {{ACM} {SIGPLAN} Notices},
  volume       = {46},
  number       = {8},
  year         = {2011},
  month        = aug,
  pages        = {113--122},
  issn         = {1558-1160},
  doi          = {10.1145/2038037.1941569},
  abstractNote = {Performance modeling for scientific applications is important for assessing potential application performance and systems procurement in high-performance computing (HPC). Recent progress on communication tracing opens up novel opportunities for communication modeling due to its lossless yet scalable trace collection. Estimating the impact of scaling on communication efficiency still remains non-trivial due to execution-time variations and exposure to hardware and software artifacts. This work contributes a fundamentally novel modeling scheme. We synthetically generate the application trace for large numbers of nodes by extrapolation from a set of smaller traces. We devise an innovative approach for topology extrapolation of single program, multiple data (SPMD) codes with stencil or mesh communication. The extrapolated trace can subsequently be (a) replayed to assess communication requirements before porting an application, (b) transformed to auto-generate communication benchmarks for various target platforms, and (c) analyzed to detect communication inefficiencies and scalability limitations. To the best of our knowledge, rapidly obtaining the communication behavior of parallel applications at arbitrary scale with the availability of timed replay, yet without actual execution of the application at this scale is without precedence and has the potential to enable otherwise infeasible system simulation at the exascale level.},
}