% Journal articles by Marathe, Mueller, et al. on memory tracing and placement.
% Conventions: one field per line, month as an unquoted macro, page ranges
% with "--", bare ISSN/DOI values, case-sensitive title words brace-protected.
% abstractNote is a non-standard field; standard styles ignore it.

@article{marathe_thakkar_mueller_2010,
  author       = {Marathe, Jaydeep and Thakkar, Vivek and Mueller, Frank},
  title        = {Feedback-directed page placement for {ccNUMA} via hardware-generated memory traces},
  journal      = {Journal of Parallel and Distributed Computing},
  volume       = {70},
  number       = {12},
  pages        = {1204--1219},
  year         = {2010},
  month        = dec,
  issn         = {1096-0848},
  doi          = {10.1016/j.jpdc.2010.08.015},
  abstractNote = {Non-uniform memory architectures with cache coherence (ccNUMA) are becoming increasingly common, not just for large-scale high performance platforms but also in the context of multi-core architectures. Under ccNUMA, data placement may influence overall application performance significantly as references resolved locally to a processor/core impose lower latencies than remote ones. This work develops a novel hardware-assisted page placement paradigm based on automated tracing of the memory references made by application threads. Two placement schemes, modeling both single-level and multi-level latencies, allocate pages near processors that most frequently access that memory page. These schemes leverage performance monitoring capabilities of contemporary microprocessors to efficiently extract an approximate trace of memory accesses. This information is used to decide page affinity, i.e., the node to which the page is bound. The method operates entirely in user space, is widely automated, and handles not only static but also dynamic memory allocation. Experiments show that this method, although based on lossy tracing, can efficiently and effectively improve page placement, leading to an average wall-clock execution time saving of over 20% for the tested benchmarks on the SGI Altix with a 2x remote access penalty and 12% on AMD Opterons with a 1.3–2.0x access penalty. This is accompanied by a one-time tracing overhead of 2.7% over the overall original program wallclock time.},
}

% The key below originally contained a space ("de supinski"). BibTeX stops
% reading a key at whitespace and then expects a comma, so the entry could not
% be parsed; the space is replaced by an underscore.
@article{marathe_mueller_mohan_mckee_de_supinski_yoo_2007,
  author       = {Marathe, Jaydeep and Mueller, Frank and Mohan, Tushar and McKee, Sally A. and De Supinski, Bronis R. and Yoo, Andy},
  title        = {{METRIC}: Memory tracing via dynamic binary rewriting to identify cache inefficiencies},
  journal      = {{ACM} Transactions on Programming Languages and Systems},
  volume       = {29},
  number       = {2},
  year         = {2007},
  month        = mar,
  issn         = {1558-4593},
  doi          = {10.1145/1216374.1216380},
  abstractNote = {With the diverging improvements in CPU speeds and memory access latencies, detecting and removing memory access bottlenecks becomes increasingly important. In this work we present METRIC, a software framework for isolating and understanding such bottlenecks using partial access traces. METRIC extracts access traces from executing programs without special compiler or linker support. We make four primary contributions. First, we present a framework for extracting partial access traces based on dynamic binary rewriting of the executing application. Second, we introduce a novel algorithm for compressing these traces. The algorithm generates constant space representations for regular accesses occurring in nested loop structures. Third, we use these traces for offline incremental memory hierarchy simulation. We extract symbolic information from the application executable and use this to generate detailed source-code correlated statistics including per-reference metrics, cache evictor information, and stream metrics. Finally, we demonstrate how this information can be used to isolate and understand memory access inefficiencies. This illustrates a potential advantage of METRIC over compile-time analysis for sample codes, particularly when interprocedural analysis is required.},
}

@article{marathe_mueller_2007,
  author       = {Marathe, Jaydeep and Mueller, Frank},
  title        = {Source-code-correlated cache coherence characterization of {OpenMP} benchmarks},
  journal      = {{IEEE} Transactions on Parallel and Distributed Systems},
  volume       = {18},
  number       = {6},
  pages        = {818--834},
  year         = {2007},
  month        = jun,
  issn         = {1558-2183},
  doi          = {10.1109/TPDS.2007.1058},
  abstractNote = {Cache coherence in shared-memory multiprocessor systems has been studied mostly from an architecture viewpoint, often by means of aggregating metrics. In many cases, aggregate events provide insufficient information for programmers to understand and optimize the coherence behavior of their applications. A better understanding would be given by source code correlations of not only aggregate events, but also finer granularity metrics directly linked to high-level source code constructs, such as source lines and data structures. In this paper, we explore a novel application-centric approach to studying coherence traffic. We develop a coherence analysis framework based on incremental coherence simulation of actual reference traces. We provide tool support to extract these reference traces and synchronization information from OpenMP threads at runtime using dynamic binary rewriting of the application executable. These traces are fed to ccSIM, our cache-coherence simulator. The novelty of ccSIM lies in its ability to relate low-level cache coherence metrics (such as coherence misses and their causative invalidations) to high-level source code constructs including source code locations and data structures. We explore the degree of freedom in interleaving data traces from different processors and assess simulation accuracy in comparison to metrics obtained from hardware performance counters. Our quantitative results show that: 1) Cache coherence traffic can be simulated with a considerable degree of accuracy for SPMD programs, as the invalidation traffic closely matches the corresponding hardware performance counters. 2) Detailed, high-level coherence statistics are very useful in detecting, isolating, and understanding coherence bottlenecks. We use ccSIM with several well-known benchmarks and find coherence optimization opportunities leading to significant reductions in coherence traffic and savings in wall-clock execution time},
}