@article{lee_tuck_2013,
  title={Automatic Parallelization of Fine-Grained Metafunctions on a Chip Multiprocessor},
  volume={10},
  ISSN={1544-3973},
  DOI={10.1145/2541228.2541237},
  abstractNote={Due to the importance of reliability and security, prior studies have proposed inlining metafunctions into applications for detecting bugs and security vulnerabilities. However, because these software techniques add frequent, fine-grained instrumentation to programs, they often incur large runtime overheads. In this work, we consider an automatic thread extraction technique for removing these fine-grained checks from a main application and scheduling them on helper threads. In this way, we can leverage the resources available on a CMP to reduce the latency and overhead of fine-grained checking codes. Our parallelization strategy extracts metafunctions from a single-threaded application and executes them in customized helper threads: threads constructed to mirror relevant fragments of the main program's behavior in order to keep communication and overhead low. To get good performance, we consider optimizations that reduce communication and balance work among many threads. We evaluate our parallelization strategy on Mudflap, a pointer-use checking tool in GCC. To show the benefits of our technique, we compare it to a manually parallelized version of Mudflap. We run our experiments on an architectural simulator with support for fast queueing operations. On a subset of SPECint 2000, our automatically parallelized code using static load balancing is only 19% slower, on average, than the manually parallelized version on a simulated eight-core system. In addition, our automatically parallelized code using dynamic load balancing is competitive, on average, with the manually parallelized version on a simulated eight-core system. Furthermore, all the applications except parser achieve better speedups with our automatic algorithms than with the manual approach. Also, our approach introduces very little overhead in the main program; it is kept under 100%, which is more than a 5.3× reduction compared to serial Mudflap.},
  number={4},
  journal={ACM Transactions on Architecture and Code Optimization},
  author={Lee, Sanghoon and Tuck, James},
  year={2013},
  month={Dec}
}

@inproceedings{lee_tuck_2011,
  title={Automatic Parallelization of Fine-grained Meta-functions on a Chip Multiprocessor},
  DOI={10.1109/cgo.2011.5764681},
  abstractNote={Due to the importance of reliability and security, prior studies have proposed inlining meta-functions into applications for detecting bugs and security vulnerabilities. However, because these software techniques add frequent, fine-grained instrumentation to programs, they often incur large runtime overheads. In this work, we consider an automatic thread extraction technique for removing these fine-grained checks from a main application and scheduling them on helper threads. In this way, we can leverage the resources available on a CMP to reduce the latency and overhead of fine-grained checking codes. Our parallelization strategy automatically extracts meta-functions from the main application and executes them in customized helper threads: threads constructed to mirror relevant fragments of the main program's behavior in order to keep communication and overhead low. To get good performance, we consider optimizations that reduce communication and balance work among many threads. We evaluate our parallelization strategy on Mudflap, a pointer-use checking tool in GCC.
To show the benefits of our technique, we compare it to a manually parallelized version of Mudflap. We run our experiments on an architectural simulator with support for fast queueing operations. On a subset of SPECint 2000, our automatically parallelized code is only 29% slower, on average, than the manually parallelized version on a simulated eight-core system. Furthermore, two applications achieve better speedups using our algorithms than with the manual approach. Also, our approach introduces very little overhead in the main program; it is kept under 100%, which is more than a 5.3× reduction compared to serial Mudflap.},
  booktitle={International Symposium on Code Generation and Optimization},
  author={Lee, Sanghoon and Tuck, James},
  year={2011},
  pages={130–140}
}

@inproceedings{lee_tiwari_yan_tuck_2011,
  title={HAQu: Hardware-accelerated queueing for fine-grained threading on a chip multiprocessor},
  DOI={10.1109/hpca.2011.5749720},
  abstractNote={Queues are commonly used in multithreaded programs for synchronization and communication. However, because software queues tend to be too expensive to support fine-grained parallelism, hardware queues have been proposed to reduce the overhead of communication between cores. Hardware queues require modifications to the processor core and need a custom interconnect. They also pose difficulties for the operating system because their state must be preserved across context switches. To solve these problems, we propose a hardware-accelerated queue, or HAQu. HAQu adds hardware to a CMP that accelerates operations on software queues. Our design implements fast queueing through an application's address space with operations that are compatible with a fully software queue. Our design provides accelerated and OS-transparent performance in three general ways: (1) it provides a single instruction for enqueueing and dequeueing, which significantly reduces the overhead when used in fine-grained threading; (2) operations on the queue are designed to leverage low-level details of the coherence protocol; and (3) hardware ensures that the full state of the queue is stored in the application's address space, thereby ensuring virtualization. We have evaluated our design in the context of two application domains: offloading fine-grained checks for improved software reliability, and automatic, fine-grained parallelization using decoupled software pipelining.},
  booktitle={International Symposium on High-Performance Computer Architecture},
  author={Lee, Sanghoon and Tiwari, Devesh and Solihin, Yan and Tuck, James},
  year={2011},
  pages={99–110}
}