@inproceedings{tiwari_solihin_2012,
  title     = {Modeling and Analyzing Key Performance Factors of Shared Memory {MapReduce}},
  author    = {Tiwari, Devesh and Solihin, Yan},
  booktitle = {2012 {IEEE} 26th International Parallel and Distributed Processing Symposium ({IPDPS})},
  year      = {2012},
  pages     = {1306--1317},
  doi       = {10.1109/ipdps.2012.119},
  issn      = {1530-2075},
  abstract  = {MapReduce parallel programming model has seen wide adoption in data center applications. Recently, lightweight, fast, in-memory MapReduce runtime systems have been proposed for shared memory systems. However, what factors affect performance and what performance bottlenecks exist for a given program, are not well understood. This paper builds an analytical model to capture key performance factors of shared memory MapReduce and investigates important performance trends and behavior. Our study discovers several important findings and implications for system designers, performance tuners, and programmers. Our model quantifies relative contribution of different key performance factors for both map and reduce phases, and shows that performance of MapReduce programs are highly input-content dependent. Our model reveals that performance is heavily affected by the order in which distinct keys are encountered during the Map phase, and the frequency of these distinct keys. Our model points out cases in which reduce phase time dominates the total execution time. We also show that data-structure and algorithm design choices affect map and reduce phases differently and sometimes affecting map phase positively while affecting reduce phase negatively. Finally, we propose an application classification framework that can be used to reason about performance bottlenecks for a given application.},
}

@inproceedings{lee_tiwari_yan_tuck_2011,
  title         = {{HAQu}: Hardware-Accelerated Queueing for Fine-Grained Threading on a Chip Multiprocessor},
  author        = {Lee, S. and Tiwari, D. and Yan, S. H. and Tuck, J.},
  booktitle     = {2011 {IEEE} 17th International Symposium on High Performance Computer Architecture ({HPCA})},
  year          = {2011},
  pages         = {99--110},
  doi           = {10.1109/hpca.2011.5749720},
  internal-note = {NOTE(review): author "Yan, S. H." looks like an export mangling of "Solihin, Yan" (cf. tiwari_solihin_2012); booktitle expanded from truncated "high-performance computer" per the HPCA DOI -- verify both against the published paper},
  abstract      = {Queues are commonly used in multithreaded programs for synchronization and communication. However, because software queues tend to be too expensive to support finegrained parallelism, hardware queues have been proposed to reduce overhead of communication between cores. Hardware queues require modifications to the processor core and need a custom interconnect. They also pose difficulties for the operating system because their state must be preserved across context switches. To solve these problems, we propose a hardware-accelerated queue, or HAQu. HAQu adds hardware to a CMP that accelerates operations on software queues. Our design implements fast queueing through an application's address space with operations that are compatible with a fully software queue. Our design provides accelerated and OS-transparent performance in three general ways: (1) it provides a single instruction for enqueueing and dequeueing which significantly reduces the overhead when used in fine-grained threading; (2) operations on the queue are designed to leverage low-level details of the coherence protocol; and (3) hardware ensures that the full state of the queue is stored in the application's address space, thereby ensuring virtualization. We have evaluated our design in the context of application domains: offloading fine-grained checks for improved software reliability, and automatic, fine-grained parallelization using decoupled software pipelining.},
}