@article{volkel_lin_hook_timp_keung_tuck_2023, title={{FrameD}: framework for {DNA}-based data storage design, verification, and validation}, volume={39}, ISSN={1367-4811}, url={https://doi.org/10.1093/bioinformatics/btad572}, DOI={10.1093/bioinformatics/btad572}, abstractNote={Abstract}, number={10}, journal={BIOINFORMATICS}, author={Volkel, Kevin D. and Lin, Kevin N. and Hook, Paul W. and Timp, Winston and Keung, Albert J. and Tuck, James M.}, editor={Kelso, Janet}, year={2023}, month=oct } @article{elnawawy_tuck_byrd_2023, title={{PreFlush}: Lightweight Hardware Prediction Mechanism for Cache Line Flush and Writeback}, ISSN={1089-795X}, DOI={10.1109/PACT58117.2023.00015}, abstractNote={Non-Volatile Main Memory (NVMM) technologies make it possible for applications to permanently store data in memory. To do so, they need to make sure that updates to persistent data comply with the crash consistency model, which often involves explicitly flushing a dirty cache line after a store and then waiting for the flush operation to complete using a store fence. While cache line flush and write back instructions can complete in the background, fence instructions expose the latency of flushing to the critical path of the program's execution, incurring significant overheads. If flush operations are started earlier, the penalty of fences can be significantly reduced. We propose PreFlush, a lightweight and transparent hardware mechanism that predicts when a cache line flush or write back is needed and speculatively performs the operation early. Since we speculatively perform the flush, we add hardware to handle flush misspeculation to ensure correct execution of the code without the need for any complex recovery mechanisms. Our PreFlush design is transparent to the programmer (i.e. it requires no modification on existing NVMM-enabled code). 
Our results show that PreFlush can improve performance by up to 25% (15.7% average) for the WHISPER NVM benchmark suite and loop-based matrix microbenchmarks.}, journal={2023 32ND INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES, PACT}, author={Elnawawy, Hussein and Tuck, James and Byrd, Gregory T.}, editor={Tuck, James and Byrd, Gregory}, year={2023}, pages={74--85} } @article{han_tuck_awad_2023, title={{Thoth}: Bridging the Gap Between Persistently Secure Memories and Memory Interfaces of Emerging {NVMs}}, ISSN={1530-0897}, DOI={10.1109/HPCA56546.2023.10070991}, abstractNote={Emerging non-volatile memories (NVMs) are expected to be part of future computing systems, including cloud systems and edge devices. In addition to the high density (and hence large capacities) NVMs can provide, they feature ultra-low idle power which makes them very promising for edge computing and data centers. Additionally, NVMs’ ability to retain data upon system crash (e.g., power outage or software bug) makes them a great candidate for high-availability and persistent applications. However, NVMs’ data retention capability brings in security challenges and further complicates today’s secure memory implementations; to ensure correct and secure system recovery, the data and security metadata must be persisted atomically (i.e., up-to-date in memory upon a crash).Despite the many efforts for rethinking secure memory implementations to enable crash-consistency, we observe that the state-of-the-art solutions are based on a major assumption that may not be suitable for future memory interfaces. Specifically, the majority of today’s solutions assume that either the encryption counter and/or message-authentication code (MAC) can be co-located with data by directly or indirectly leveraging the otherwise Error-Correcting Codes (ECC) bits. 
However, we observe that emerging interfaces and standards delegate the ECC calculation and management to happen inside the memory module, which makes it possible to remove extra bits for ECC in memory interfaces. Thus, all today’s solutions may need to separately persist the encrypted data, its MAC, and its encryption counter upon each memory write. To mitigate this issue, we propose a novel solution, Thoth, which leverages a novel off-chip persistent partial updates combine buffer that can ensure crash consistency at the cost of a fraction of the write amplification by the state-of-the-art solutions when adapted to future interfaces. Based on our evaluation, Thoth improves the performance by an average of 1.22x (up to 1.44x) while reducing write traffic by an average of 32% (up to 40%) compared to the baseline Anubis when adapted to future interfaces.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Han, Xijing and Tuck, James and Awad, Amro}, year={2023}, pages={94--107} } @article{volkel_tomek_keung_tuck_2022, title={{DINOS}: Data {INspired} Oligo Synthesis for {DNA} Data Storage}, volume={18}, ISSN={1550-4840}, url={http://dx.doi.org/10.1145/3510853}, DOI={10.1145/3510853}, abstractNote={As interest in DNA-based information storage grows, the costs of synthesis have been identified as a key bottleneck. A potential direction is to tune synthesis for data. Data strands tend to be composed of a small set of recurring code word sequences, and they contain longer sequences of repeated data. To exploit these properties, we propose a new framework called DINOS. DINOS consists of three key parts: (i) The first is a hierarchical strand assembly algorithm, inspired by gene assembly techniques that can assemble arbitrary data strands from a small set of primitive blocks. 
(ii) The assembly algorithm relies on our novel formulation for how to construct primitive blocks, spanning a variety of useful configurations from a set of code words and overhangs. Each primitive block is a code word flanked by a pair of overhangs that are created by a cyclic pairing process that keeps the number of primitive blocks small. Using these primitive blocks, any data strand of arbitrary length can be assembled, theoretically. We show a minimal system for a binary code with as few as six primitive blocks, and we generalize our processes to support an arbitrary set of overhangs and code words. (iii) We exploit our hierarchical assembly approach to identify redundant sequences and coalesce the reactions that create them to make assembly more efficient.}, number={3}, journal={ACM JOURNAL ON EMERGING TECHNOLOGIES IN COMPUTING SYSTEMS}, publisher={Association for Computing Machinery (ACM)}, author={Volkel, Kevin and Tomek, Kyle J. and Keung, Albert J. and Tuck, James M.}, year={2022}, month={Jul} } @article{han_tuck_awad_2022, title={Horus: Persistent Security for Extended Persistence-Domain Memory Systems}, ISSN={["1072-4451"]}, DOI={10.1109/MICRO56248.2022.00087}, abstractNote={Persistent memory presents a great opportunity for crash-consistent computing in large-scale computing systems. The ability to recover data upon power outage or crash events can significantly improve the availability of large-scale systems, while improving the performance of persistent data applications (e.g., database applications). However, persistent memory suffers from high write latency and requires specific programming model (e.g., Intel’s PMDK) to guarantee crash consistency, which results in long latency to persist data. To mitigate these problems, recent standards advocate for sufficient back-up power that can flush the whole cache hierarchy to the persistent memory upon detection of an outage, i.e., extending the persistence domain to include the cache hierarchy. 
In the secure NVM with extended persistent domain(EPD), in addition to flushing the cache hierarchy, extra actions need to be taken to protect the flushed cache data. These extra actions of secure operation could cause significant burden on energy costs and battery size. We demonstrate that naive implementations could lead to significantly expanding the required power holdup budget (e.g., 10.3x more operations than EPD system without secure memory support). The significant overhead is caused by memory accesses of secure metadata. In this paper, we present Horus, a novel EPD-aware secure memory implementation. Horus reduces the overhead during draining period of EPD system by reducing memory accesses of secure metadata. Experiment result shows that Horus reduces the draining time by 5x, compared with the naive baseline design.}, journal={2022 55TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO)}, author={Han, Xijing and Tuck, James and Awad, Amro}, year={2022}, pages={1255–1269} } @article{alshboul_ramrakhyani_wang_tuck_solihin_2021, title={BBB: Simplifying Persistent Programming using Battery-Backed Buffers}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA51647.2021.00019}, abstractNote={Non-volatile memory (NVM) is poised to augment or replace DRAM as main memory. With the right abstraction and support, non-volatile main memory (NVMM) can provide an alternative to the storage system to host long-lasting persistent data. However, keeping persistent data in memory requires programs to be written such that data is crash consistent (i.e. it can be recovered after failure). Critical to supporting crash recovery is the guarantee of ordering of when stores become durable with respect to program order. Strict persistency, which requires persist order to coincide with program order of stores, is simple and intuitive but generally thought to be too slow. More relaxed persistency models are available but demand higher programming complexity, e.g. 
they require the programmer to insert persist barriers correctly in their program.We identify the source of strict persistency inefficiency as the gap between the point of visibility (PoV) which is the cache, and the point of persistency (PoP) which is the memory. In this paper, we propose a new approach to close the PoV/PoP gap which we refer to as Battery-Backed Buffer (BBB). The key idea of BBB is to provide a battery-backed persist buffer (bbPB) in each core next to the L1 data cache (L1D). A store value is allocated in the bbPB as it is written to cache, becoming part of the persistence domain. If a crash occurs, battery ensures bbPB can be fully drained to NVMM. BBB simplifies persistent programming as the programmer does not need to insert persist barriers or flushes. Furthermore, our BBB design achieves nearly identical results to eADR in terms of performance and number of NVMM writes, while requiring two orders of magnitude smaller energy and time to drain.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Alshboul, Mohammad and Ramrakhyani, Prakash and Wang, William and Tuck, James and Solihin, Yan}, year={2021}, pages={111–124} } @misc{matange_tuck_keung_2021, title={DNA stability: a central design consideration for DNA data storage systems}, volume={12}, ISSN={["2041-1723"]}, url={https://doi.org/10.1038/s41467-021-21587-5}, DOI={10.1038/s41467-021-21587-5}, abstractNote={Abstract}, number={1}, journal={NATURE COMMUNICATIONS}, author={Matange, Karishma and Tuck, James M. and Keung, Albert J.}, year={2021}, month={Mar} } @inproceedings{han_tuck_awad_2021, title={Dolos: Improving the Performance of Persistent Applications in ADR-Supported Secure Memory}, url={http://dx.doi.org/10.1145/3466752.3480118}, DOI={10.1145/3466752.3480118}, abstractNote={The performance of persistent applications is severely hurt by current secure processor architectures. 
Persistent applications use long-latency flush instructions and memory fences to make sure that writes to persistent data reach the persistency domain in a way that is crash consistent. Recently introduced features like Intel’s Asynchronous DRAM Refresh (ADR) make the on-chip Write Pending Queue (WPQ) part of the persistency domain and help reduce the penalty of persisting data since data only needs to reach the on-chip WPQ to be considered persistent. However, when persistent applications run on secure processors, for the sake of securing memory many cycles are added to the critical path of their write operations before they ever reach the persistent WPQ, preventing them from fully exploiting the performance advantages of the persistent WPQ. Our goal in this work is to make it feasible for secure persistent applications to benefit more from the on-chip persistency domain. We propose Dolos, an architecture that prioritizes persisting data without sacrificing security in order to gain a significant performance boost for persistent applications. Dolos achieves this goal by an additional minor security unit, Mi-SU, that utilizes a much faster secure process that protects only the WPQ. Thus, the secure operation latency in the critical path of persist operations is reduced and hence persistent transactions can complete earlier. Dolos retains a conventional major security unit for protecting memory that occurs off the critical path after inserting secured data into the WPQ. To evaluate our design, we implemented our architecture in the GEM5 simulator, and analyzed the performance of 6 benchmarks from the WHISPER suite. 
Dolos improves their performance by 1.66x on average.}, booktitle={MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture}, publisher={ACM}, author={Han, Xijing and Tuck, James and Awad, Amro}, year={2021}, month={Oct} } @article{tomek_volkel_indermaur_tuck_keung_2021, title={Promiscuous molecules for smarter file operations in DNA-based data storage}, volume={12}, ISSN={["2041-1723"]}, url={https://doi.org/10.1038/s41467-021-23669-w}, DOI={10.1038/s41467-021-23669-w}, abstractNote={Abstract}, number={1}, journal={NATURE COMMUNICATIONS}, author={Tomek, Kyle J. and Volkel, Kevin and Indermaur, Elaine W. and Tuck, James M. and Keung, Albert J.}, year={2021}, month={Jun} } @article{lin_volkel_tuck_keung_2020, title={Dynamic and scalable DNA-based information storage}, volume={11}, ISSN={["2041-1723"]}, url={https://doi.org/10.1038/s41467-020-16797-2}, DOI={10.1038/s41467-020-16797-2}, abstractNote={Abstract}, number={1}, journal={NATURE COMMUNICATIONS}, publisher={Springer Science and Business Media LLC}, author={Lin, Kevin N. and Volkel, Kevin and Tuck, James M. and Keung, Albert J.}, year={2020}, month={Jun} } @article{solihin_alshboul_tuck_2020, title={Methods of crash recovery for data stored in non-volatile main memory}, note={US Patent App. 16/564,479}, author={Solihin, Yan and Alshboul, Mohammad and Tuck, James}, year={2020}, month={Mar} } @article{wang_tuck_2020, title={Persistent Data Retention Models}, journal={arXiv preprint arXiv:2009.14705}, author={Wang, Tiancong and Tuck, James}, year={2020} } @article{samara_tuck_2020, title={The Case for Domain-Specialized Branch Predictors for Graph-Processing}, volume={19}, ISSN={["1556-6064"]}, DOI={10.1109/LCA.2020.3005895}, abstractNote={Branch prediction is believed by many to be a solved problem, with state-of-the-art predictors achieving near-perfect prediction for many programs. 
In this article, we conduct a detailed simulation of graph-processing workloads in the GAPBS benchmark suite and show that branch mispredictions occur frequently and are still a large limitation on performance in key graph-processing applications. We provide a detailed analysis of which branches are mispredicting and show that a few key branches are the main source of performance degradation across the graph-processing benchmarks we looked at. We also propose a few ideas for future work to improve branch prediction accuracy on graph workloads.}, number={2}, journal={IEEE COMPUTER ARCHITECTURE LETTERS}, publisher={IEEE}, author={Samara, Ahmed and Tuck, James}, year={2020}, pages={101–104} } @inproceedings{alshboul_tuck_solihin_2020, title={WET: write efficient loop tiling for non-volatile main memory}, booktitle={2020 57th ACM/IEEE Design Automation Conference (DAC)}, author={Alshboul, Mohammad and Tuck, James and Solihin, Yan}, year={2020}, pages={1–6} } @article{tomek_volkel_simpson_hass_indermaur_tuck_keung_2019, title={Driving the Scalability of DNA-Based Information Storage Systems}, volume={8}, ISSN={["2161-5063"]}, DOI={10.1021/acssynbio.9b00100}, abstractNote={The extreme density of DNA presents a compelling advantage over current storage media; however, to reach practical capacities, new systems for organizing and accessing information are needed. Here, we use chemical handles to selectively extract unique files from a complex database of DNA mimicking 5 TB of data and design and implement a nested file address system that increases the theoretical maximum capacity of DNA storage systems by five orders of magnitude. These advancements enable the development and future scaling of DNA-based data storage systems with modern capacities and file access capabilities.}, number={6}, journal={ACS SYNTHETIC BIOLOGY}, publisher={American Chemical Society}, author={Tomek, Kyle J. and Volkel, Kevin and Simpson, Alexander and Hass, Austin G. and Indermaur, Elaine W. 
and Tuck, James M. and Keung, Albert J.}, year={2019}, month={Jun}, pages={1241–1248} } @article{lin_keung_tuck_2019, title={Dynamic DNA-based information storage}, journal={bioRxiv}, publisher={Cold Spring Harbor Laboratory}, author={Lin, Kevin N and Keung, Albert J and Tuck, James M}, year={2019}, pages={836429} } @article{koryachko_matthiadis_haque_muhammad_ducoste_tuck_long_williams_2019, title={Dynamic modelling of the iron deficiency modulated transcriptome response in Arabidopsis thaliana roots}, volume={1}, number={1}, journal={in silico Plants}, publisher={Oxford University Press US}, author={Koryachko, Alexandr and Matthiadis, Anna and Haque, Samiul and Muhammad, Durreshahwar and Ducoste, Joel J and Tuck, James M and Long, Terri A and Williams, Cranos M}, year={2019}, pages={diz005} } @article{alshboul_elnawawy_elkhouly_kimura_tuck_solihin_2019, title={Efficient Checkpointing with Recompute Scheme for Non-volatile Main Memory}, volume={16}, ISSN={["1544-3973"]}, DOI={10.1145/3323091}, abstractNote={Future main memory will likely include Non-Volatile Memory. Non-Volatile Main Memory (NVMM) provides an opportunity to rethink checkpointing strategies for providing failure safety to applications. 
While there are many checkpointing and logging schemes in the literature, their use must be revisited as they incur high execution time overheads as well as a large number of additional writes to NVMM, which may significantly impact write endurance.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, publisher={ACM New York, NY, USA}, author={Alshboul, Mohammad and Elnawawy, Hussein and Elkhouly, Reem and Kimura, Keiji and Tuck, James and Solihin, Yan}, year={2019}, month={May} } @article{wang_sambasivam_tuck_2018, title={Hardware Supported Permission Checks On Persistent Objects for Performance and Programmability}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA.2018.00046}, abstractNote={Non-Volatile Memory technologies are advancing rapidly and may replace DRAM in future systems. However, a key question is how programmers will use them to construct and manipulate persistent data. One possible approach gives programmers direct access to persistent memory using relocatable persistent pools that hold persistent objects which can be accessed using persistent pointers, called ObjectIDs. Prior work has shown that hardware-supported address translation for ObjectIDs provides significant performance improvement and simplifies programming, however these works did not consider the large overheads incurred to check permissions before accessing persistent objects. In this paper, we identify permission checking in hardware as a critical mechanism that must be included when translating ObjectIDs to addresses in order to simplify programming and fully benefit from hardware translation. To support it, we add a System Persistent Object Table (SPOT) to support translation and permissions checks on ObjectIDs. The SPOT holds all known pools, their physical address, and their permissions information in memory. When a program attempts to access a persistent object, the SPOT is consulted and permissions are verified without trapping to the operating system. 
We have implemented our new design in a cycle accurate simulator and compared it with software only approaches and prior work. We find that our design offers a compelling 3.3x speedup on average for microbenchmarks that access pools with the RANDOM pattern and 1.4x and 1.7x speedup on TPC-C and vacation, respectively, for the SEPARATE pattern.}, journal={2018 ACM/IEEE 45TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA)}, author={Wang, Tiancong and Sambasivam, Sakthikumaran and Tuck, James}, year={2018}, pages={466–478} } @inproceedings{wang_sambasivam_tuck_2018, title={Hardware supported permission checks on persistent objects for performance and programmability}, booktitle={2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)}, author={Wang, Tiancong and Sambasivam, Sakthikumaran and Tuck, James}, year={2018}, pages={466–478} } @article{cohen_shen_torrellas_tuck_zhou_adve_akturk_bagchi_balasubramonian_barik_et al._2018, title={Inter-disciplinary research challenges in computer systems for the 2020s}, journal={National Science Foundation, USA, Tech. Rep}, author={Cohen, Albert and Shen, Xipeng and Torrellas, Josep and Tuck, James and Zhou, Yuanyuan and Adve, Sarita and Akturk, Ismail and Bagchi, Saurabh and Balasubramonian, Rajeev and Barik, Rajkishore and et al.}, year={2018} } @article{alshboul_tuck_solihin_2018, title={Lazy Persistency: a High-Performing and Write-Efficient Software Persistency Technique}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA.2018.00044}, abstractNote={Emerging Non-Volatile Memories (NVMs) are expected to be included in future main memory, providing the opportunity to host important data persistently in main memory. However, achieving persistency requires that programs be written with failure-safety in mind. Many persistency models and techniques have been proposed to help the programmer reason about failure-safety. 
They require that the programmer eagerly flush data out of caches to make it persistent. Eager persistency comes with a large overhead because it adds many instructions to the program for flushing cache lines and incurs costly stalls at barriers to wait for data to become durable. To reduce these overheads, we propose Lazy Persistency (LP), a software persistency technique that allows caches to slowly send dirty blocks to the NVMM through natural evictions. With LP, there are no additional writes to NVMM, no decrease in write endurance, and no performance degradation from cache line flushes and barriers. Persistency failures are discovered using software error detection (checksum), and the system recovers from them by recomputing inconsistent results. We describe the properties and design of LP and demonstrate how it can be applied to loop-based kernels popularly used in scientific computing. We evaluate LP and compare it to the state-of-the-art Eager Persistency technique from prior work. Compared to it, LP reduces the execution time and write amplification overheads from 9% and 21% to only 1% and 3%, respectively.}, journal={2018 ACM/IEEE 45TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA)}, author={Alshboul, Mohammad and Tuck, James and Solihin, Yan}, year={2018}, pages={439–451} } @inproceedings{alshboul_tuck_solihin_2018, title={Lazy persistency: A high-performing and write-efficient software persistency technique}, booktitle={2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)}, author={Alshboul, Mohammad and Tuck, James and Solihin, Yan}, year={2018}, pages={439–451} } @inproceedings{wibowo_agrawal_tuck_2017, title={Characterizing the impact of soft errors across microarchitectural structures and implications for predictability}, DOI={10.1109/iiswc.2017.8167782}, abstractNote={The trends of transistor size and system complexity scaling continue. 
As a result, soft errors in the system, including the processor core, are predicted to become one of the major reliability challenges. A fraction of soft errors at the device level could become an unmasked error visible to the user. Unmasked soft errors may manifest as a detectable error, which could be recoverable (DRE) or unrecoverable (DUE), or a Silent Data Corruption (SDC). Detecting and recovering from an SDC is especially challenging since an explicit checker is needed to detect erroneous state. Predicting when SDCs are more likely could be valuable in designing resilient systems. To gain insight, we evaluate the Architectural Vulnerability Factor (AVF) of all major in-core memory structures of an out-of-order superscalar processor. In particular, we focus on the vulnerability factors for detectable and unrecoverable errors (DUEAVF) and silent data corruptions (SDCAVF) across windows of execution to study their characteristics, time-varying behavior, and their predictability using a linear regression trained offline. We perform more than 35 million microarchitectural fault injection simulations and, if necessary, run-to-completion using functional simulations to determine AVF, DUEAVF, and SDCAVF. Our study shows that, similar to AVF, DUEAVF and SDCAVF vary over time and across applications. We also find significant differences in DUEAVF and SDCAVF across the processor structures we studied. Furthermore, we find that DUEAVF can be predicted using a linear regression with similar accuracy as AVF estimation. However, SDCAVF could not be predicted with the same level of accuracy. As a remedy, we propose adding a software vulnerability factor, in the form of SDCPVF, to the linear regression model for estimating SDCAVF. We find that SDCPVF of the Architectural Register File explains most of the behavior of SDCAVF for the combined microarchitectural structures studied in this paper. 
Our evaluation shows that the addition of SDCPVF improves the accuracy by 5.19×, on average, to a level similar to DUEAVF and AVF estimates. We also evaluate the impact of limiting software-layer reliability information to only 5 basic blocks (16× cost reduction, on average), and observe that it increases error only by 18.7%, on average.}, booktitle={2017 IEEE International Symposium on Workload Characterization (IISWC)}, author={Wibowo, Bagus and Agrawal, Abhinav and Tuck, James}, year={2017}, pages={250–260} } @inproceedings{elnawawy_alshboul_tuck_solihin_2017, title={Efficient checkpointing of loop-based codes for non-volatile main memory}, booktitle={2017 26th International Conference on Parallel Architectures and Compilation Techniques (PACT)}, author={Elnawawy, Hussein and Alshboul, Mohammad and Tuck, James and Solihin, Yan}, year={2017}, pages={318–329} } @inproceedings{wang_sambasivam_solihin_tuck_2017, title={Hardware supported persistent object address translation}, booktitle={2017 50th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, author={Wang, Tiancong and Sambasivam, Sakthikumaran and Solihin, Yan and Tuck, James}, year={2017}, pages={800–812} } @article{shin_tuck_solihin_2017, title={Hiding the Long Latency of Persist Barriers Using Speculative Execution}, DOI={10.1145/3079856.3080240}, abstractNote={Byte-addressable non-volatile memory technology is emerging as an alternative for DRAM for main memory. This new Non-Volatile Main Memory (NVMM) allows programmers to store important data in data structures in memory instead of serializing it to the file system, thereby providing a substantial performance boost. However, modern systems reorder memory operations and utilize volatile caches for better performance, making it difficult to ensure a consistent state in NVMM. Intel recently announced a new set of persistence instructions, clflushopt, clwb, and pcommit. 
These new instructions make it possible to implement fail-safe code on NVMM, but few workloads have been written or characterized using these new instructions. In this work, we describe how these instructions work and how they can be used to implement write-ahead logging based transactions. We implement several common data structures and kernels and evaluate the performance overhead incurred over traditional non-persistent implementations. In particular, we find that persistence instructions occur in clusters along with expensive fence operations, they have long latency, and they add a significant execution time overhead, on average by 20.3% over code with logging but without fence instructions to order persists. To deal with this overhead and alleviate the performance bottleneck, we propose to speculate past long latency persistency operations using checkpoint-based processing. Our speculative persistence architecture reduces the execution time overheads to only 3.6%.}, journal={44TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2017)}, author={Shin, Seunghee and Tuck, James and Solihin, Yan}, year={2017}, pages={175–186} } @inproceedings{shin_tuck_solihin_2017, title={Hiding the long latency of persist barriers using speculative execution}, booktitle={Proceedings of the 44th Annual International Symposium on Computer Architecture}, author={Shin, Seunghee and Tuck, James and Solihin, Yan}, year={2017}, pages={175–186} } @inproceedings{huh_tuck_2017, title={Improving the effectiveness of searching for isomorphic chains in superword level parallelism}, booktitle={2017 50th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)}, author={Huh, Joonmoo and Tuck, James}, year={2017}, pages={718–729} } @inproceedings{agrawal_loh_tuck_2017, title={Leveraging near data processing for high-performance checkpoint/restart}, booktitle={Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, 
author={Agrawal, Abhinav and Loh, Gabriel H and Tuck, James}, year={2017}, pages={1–12} } @inproceedings{shin_tirukkovalluri_tuck_solihin_2017, title={Proteus: A flexible and fast software supported hardware logging approach for nvm}, booktitle={Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture}, author={Shin, Seunghee and Tirukkovalluri, Satish Kumar and Tuck, James and Solihin, Yan}, year={2017}, pages={178–190} } @article{patsilaras_tuck_2017, title={ReDirect: Reconfigurable Directories for Multicore Architectures}, volume={14}, ISSN={["1544-3973"]}, DOI={10.1145/3162015}, abstractNote={As we enter the dark silicon era, architects should not envision designs in which every transistor remains turned on permanently but rather ones in which portions of the chip are judiciously turned on/off depending on the characteristics of a workload. At the same time, due to the increasing cost per transistor, architects should also consider new ways to re-purpose transistors to increase their architectural value.}, number={4}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, publisher={ACM New York, NY, USA}, author={Patsilaras, George and Tuck, James}, year={2017}, month={Dec} } @article{wibowo_agrawal_stanton_tuck_2016, title={An Accurate Cross-Layer Approach for Online Architectural Vulnerability Estimation}, volume={13}, ISSN={["1544-3973"]}, DOI={10.1145/2975588}, abstractNote={Processor soft-error rates are projected to increase as feature sizes scale down, necessitating the adoption of reliability-enhancing techniques, but power and performance overhead remain a concern of such techniques. Dynamic cross-layer techniques are a promising way to improve the cost-effectiveness of resilient systems. 
As a foundation for making such a system, we propose a cross-layer approach for estimating the architectural vulnerability of a processor core online that works by combining information from software, compiler, and microarchitectural layers at runtime. The hardware layer combines the metadata from software and compiler layers with microarchitectural measurements to estimate architectural vulnerability online. We describe our design and evaluate it in detail on a set of SPEC CPU 2006 applications. We find that our online AVF estimate is highly accurate with respect to a postmortem AVF analysis, with only 0.46% average absolute error. Also, our design incurs negligible performance impact for SPEC2006 applications and about 1.2% for a Monte Carlo application, requires approximately 1.4% area overhead, and costs about 3.3% more power on average. We compare our technique against two prior online AVF estimation techniques, one using a linear regression to estimate AVF and another based on PVF-HVF; our evaluation finds that our approach, on average, is more accurate. Our case study of a Monte Carlo simulation shows that our AVF estimate can adapt to the inherent resiliency of the algorithm. 
Finally, we demonstrate the effectiveness of our approach using a dynamic protection scheme that limits vulnerability to soft errors while reducing the energy consumption by an average of 4.8%, and with a target normalized SER of 10%, compared to enabling a simple parity+ECC protection at all times.}, number={3}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, publisher={ACM New York, NY, USA}, author={Wibowo, Bagus and Agrawal, Abhinav and Stanton, Thomas and Tuck, James}, year={2016}, month={Sep} } @article{shen_mueller_tuck_2016, title={Languages and Compilers for Parallel Computing: 28th International Workshop, LCPC 2015, Raleigh, NC, USA, September 9-11, 2015, Revised Selected Papers}, publisher={Springer}, author={Shen, Xipeng and Mueller, Frank and Tuck, James}, year={2016} } @article{milewicz_vanka_tuck_quinlan_pirkelbauer_2016, title={Lightweight runtime checking of C programs with RTC}, volume={45}, ISSN={["1873-6866"]}, DOI={10.1016/j.cl.2016.01.001}, abstractNote={The C Programming Language is known for being an efficient language that can be compiled on almost any architecture and operating system. However the absence of dynamic safety checks and a relatively weak type system allows programmer oversights that are hard to spot. In this paper, we present RTC, a runtime monitoring tool that instruments unsafe code and monitors the program execution. RTC is built on top of the ROSE compiler infrastructure. RTC finds memory bugs and arithmetic overflows and underflows, and run-time type violations. Most of the instrumentations are directly added to the source file and only require a minimal runtime system. As a result, the instrumented code remains portable. In tests against known error detection benchmarks, RTC found 98% of all memory related bugs and had zero false positives. 
In performance tests conducted with well known algorithms, such as binary search and MD5, we determined that our tool has an average run-time overhead rate of 9.7× and memory overhead rate of 3.5×.}, journal={COMPUTER LANGUAGES SYSTEMS & STRUCTURES}, author={Milewicz, Reed and Vanka, Rajesh and Tuck, James and Quinlan, Daniel and Pirkelbauer, Peter}, year={2016}, month={Apr}, pages={191–203} } @article{milewicz_vanka_tuck_quinlan_pirkelbauer_2016, title={Lightweight runtime checking of C programs with RTC}, volume={45}, journal={Computer Languages, Systems & Structures}, publisher={Elsevier}, author={Milewicz, Reed and Vanka, Rajesh and Tuck, James and Quinlan, Daniel and Pirkelbauer, Peter}, year={2016}, pages={191–203} } @article{koryachko_matthiadis_muhammad_foret_brady_ducoste_tuck_long_williams_2015, title={Clustering and Differential Alignment Algorithm: Identification of Early Stage Regulators in the Arabidopsis thaliana Iron Deficiency Response}, volume={10}, ISSN={["1932-6203"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84943338816&partnerID=MN8TOARS}, DOI={10.1371/journal.pone.0136591}, abstractNote={Time course transcriptome datasets are commonly used to predict key gene regulators associated with stress responses and to explore gene functionality. Techniques developed to extract causal relationships between genes from high throughput time course expression data are limited by low signal levels coupled with noise and sparseness in time points. We deal with these limitations by proposing the Cluster and Differential Alignment Algorithm (CDAA). This algorithm was designed to process transcriptome data by first grouping genes based on stages of activity and then using similarities in gene expression to predict influential connections between individual genes. 
Regulatory relationships are assigned based on pairwise alignment scores generated using the expression patterns of two genes and some inferred delay between the regulator and the observed activity of the target. We applied the CDAA to an iron deficiency time course microarray dataset to identify regulators that influence 7 target transcription factors known to participate in the Arabidopsis thaliana iron deficiency response. The algorithm predicted that 7 regulators previously unlinked to iron homeostasis influence the expression of these known transcription factors. We validated over half of predicted influential relationships using qRT-PCR expression analysis in mutant backgrounds. One predicted regulator-target relationship was shown to be a direct binding interaction according to yeast one-hybrid (Y1H) analysis. These results serve as a proof of concept emphasizing the utility of the CDAA for identifying unknown or missing nodes in regulatory cascades, providing the fundamental knowledge needed for constructing predictive gene regulatory networks. We propose that this tool can be used successfully for similar time course datasets to extract additional information and infer reliable regulatory connections for individual genes.}, number={8}, journal={PLOS ONE}, publisher={Public Library of Science}, author={Koryachko, Alexandr and Matthiadis, Anna and Muhammad, Durreshahwar and Foret, Jessica and Brady, Siobhan M. and Ducoste, Joel J. and Tuck, James and Long, Terri A. 
and Williams, Cranos}, year={2015}, month={Aug} } @article{koryachko_matthiadis_ducoste_tuck_long_williams_2015, title={Computational approaches to identify regulators of plant stress response using high-throughput gene expression data}, volume={3-4}, ISSN={2214-6628}, url={http://dx.doi.org/10.1016/J.CPB.2015.04.001}, DOI={10.1016/J.CPB.2015.04.001}, abstractNote={Insight into biological stress regulatory pathways can be derived from high-throughput transcriptomic data using computational algorithms. These algorithms can be integrated into a computational approach to provide specific testable predictions that answer biological questions of interest. This review conceptually organizes a wide variety of developed algorithms into a classification system based on desired type of output predictions. This classification is then used as a structure to describe completed approaches in the literature, with a focus on project goals, overall path of implemented algorithms, and biological insight gained. These algorithms and approaches are introduced mainly in the context of research on the model plant species Arabidopsis thaliana under stress conditions, though the nature of computational techniques makes these approaches easily applicable to a wide range of species, data types, and conditions.}, journal={Current Plant Biology}, publisher={Elsevier BV}, author={Koryachko, Alexandr and Matthiadis, Anna and Ducoste, Joel J. and Tuck, James and Long, Terri A. 
and Williams, Cranos}, year={2015}, month={Sep}, pages={20–29} } @inproceedings{franzon_rotenberg_tuck_davis_zhou_schabel_zhang_dwiel_forbes_huh_et al._2015, title={Computing in 3D}, booktitle={2015 IEEE Custom Integrated Circuits Conference (CICC)}, author={Franzon, Paul and Rotenberg, Eric and Tuck, James and Davis, W Rhett and Zhou, Huiyang and Schabel, Joshua and Zhang, Zhenquian and Dwiel, J Brandon and Forbes, Elliott and Huh, Joonmoo and et al.}, year={2015}, pages={1–6} } @article{sheikh_tuck_rotenberg_2015, title={Control-Flow Decoupling: An Approach for Timely, Non-Speculative Branching}, volume={64}, ISSN={["1557-9956"]}, DOI={10.1109/tc.2014.2361526}, abstractNote={Mobile and PC/server class processor companies continue to roll out flagship core microarchitectures that are faster than their predecessors. Meanwhile placing more cores on a chip coupled with constant supply voltage puts per-core energy consumption at a premium. Hence, the challenge is to find future microarchitecture optimizations that not only increase performance but also conserve energy. Eliminating branch mispredictions-which waste both time and energy-is valuable in this respect. In this paper, we explore the control-flow landscape by characterizing mispredictions in four benchmark suites. We find that a third of mispredictions-per-1K-instructions (MPKI) come from what we call separable branches: branches with large control-dependent regions (not suitable for if-conversion), whose backward slices do not depend on their control-dependent instructions or have only a short dependence. We propose control-flow decoupling (CFD) to eradicate mispredictions of separable branches. The idea is to separate the loop containing the branch into two loops: the first contains only the branch's predicate computation and the second contains the branch and its control-dependent instructions. The first loop communicates branch outcomes to the second loop through an architectural queue. 
Microarchitecturally, the queue resides in the fetch unit to drive timely, non-speculative branching. On a microarchitecture configured similar to Intel's Sandy Bridge core, CFD increases performance by up to 55 percent, and reduces energy consumption by up to 49 percent (for CFD regions). Moreover, for some applications, CFD is a necessary catalyst for future complexity-effective large-window architectures to tolerate memory latency.}, number={8}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Sheikh, Rami and Tuck, James and Rotenberg, Eric}, year={2015}, month={Aug}, pages={2182–2203} } @article{milewicz_vanka_tuck_quinlan_pirkelbauer_2015, title={Runtime Checking C Programs}, DOI={10.1145/2695664.2695906}, abstractNote={The C Programming Language is known for being an efficient language that can be compiled on almost any architecture and operating system. However the absence of dynamic safety checks and a relatively weak type system allows programmer oversights that are hard to spot. In this paper, we present RTC, a runtime monitoring tool that instruments unsafe code and monitors the program execution. RTC is built on top of the ROSE compiler infrastructure. RTC finds memory bugs and arithmetic overflows and underflows, and run-time type violations. Most of the instrumentations are directly added to the source file and only require a minimal runtime system. As a result, the instrumented code remains portable. In tests against known error detection benchmarks, RTC found 98% of all memory related bugs and had zero false positives. 
In performance tests conducted with well known algorithms, such as binary search and MD5, we determined that the unoptimized overhead rate is between a factor of 1.8 and a factor of 77 respectively.}, journal={30TH ANNUAL ACM SYMPOSIUM ON APPLIED COMPUTING, VOLS I AND II}, author={Milewicz, Reed and Vanka, Rajesh and Tuck, James and Quinlan, Daniel and Pirkelbauer, Peter}, year={2015}, pages={2107–2114} } @inproceedings{milewicz_vanka_tuck_quinlan_pirkelbauer_2015, title={Runtime checking C programs}, booktitle={Proceedings of the 30th Annual ACM Symposium on Applied Computing}, author={Milewicz, Reed and Vanka, Rajesh and Tuck, James and Quinlan, Daniel and Pirkelbauer, Peter}, year={2015}, pages={2107–2114} } @inproceedings{agrawal_wibowo_tuck_2015, title={Source Mark: A Source-Level Approach for Identifying Architecture and Optimization Agnostic Regions for Performance Analysis}, booktitle={2015 IEEE International Symposium on Workload Characterization}, author={Agrawal, Abhinav and Wibowo, Bagus and Tuck, James}, year={2015}, pages={160–171} } @article{agrawal_wibowo_tuck_2015, title={SourceMark: A Source-Level Approach for Identifying Architecture and Optimization Agnostic Regions for Performance Analysis}, DOI={10.1109/iiswc.2015.27}, abstractNote={Computer architects often evaluate performance on only parts of a program and not the entire program due to long simulation times that could take weeks or longer to finish. However, choosing regions of a program to evaluate in a way that is consistent and correct with respect to different compilers and different architectures is very challenging and has not received sufficient attention. The need for such tools is growing in importance given the diversity of architectures and compilers in use today. In this work, we propose a technique that identifies regions of a desired granularity for performance evaluation. 
We use a source-to-source compiler that inserts software marks into the program's source code to divide the execution into regions with a desired dynamic instruction count. An evaluation framework chooses from among a set of candidate marks to find ones that are both consistent across different architectures or compilers and can yield a low run-time instruction overhead. Evaluated on a set of SPEC applications, with a region size of about 100 million instructions, our technique has a dynamic instruction overhead as high as 3.3% with an average overhead of 0.47%. We also demonstrate the scalability of our technique by evaluating the dynamic instruction overhead for regions of finer granularity and show similar small overheads, of the applications we studied, we were unable to find suitable fine grained regions only for 462.libquantum and 444.namd. Our technique is an effective alternative to traditional binary-level approaches. We have demonstrated that a source-level approach is robust, that it can achieve low overhead, and that it reduces the effort for bringing up new architectures or compilers into an existing evaluation framework.}, journal={2015 IEEE INTERNATIONAL SYMPOSIUM ON WORKLOAD CHARACTERIZATION (IISWC)}, author={Agrawal, Abhinav and Wibowo, Bagus and Tuck, James}, year={2015}, pages={160–171} } @inproceedings{franzon_rotenberg_tuck_zhou_davis_dai_huh_ku_lipa_li_et al._2014, title={3D-enabled customizable embedded computer (3DECC)}, booktitle={2014 International 3D Systems Integration Conference (3DIC)}, author={Franzon, Paul D and Rotenberg, Eric and Tuck, James and Zhou, Huiyang and Davis, W Rhett and Dai, Hongwen and Huh, Joonmoo and Ku, Sunkgwan and Lipa, Steve and Li, Chao and et al.}, year={2014}, pages={1–3} } @article{sheikh_tuck_rotenberg_2014, title={Control-flow decoupling: An approach for timely, non-speculative branching}, volume={64}, number={8}, journal={IEEE Transactions on Computers}, publisher={IEEE}, author={Sheikh, Rami and Tuck, 
James and Rotenberg, Eric}, year={2014}, pages={2182–2203} } @inproceedings{franzon_rotenberg_tuck_davis_zhou_schabel_zhang_park_dwiel_forbes_et al._2013, title={Applications and design styles for 3DIC}, booktitle={2013 IEEE International Electron Devices Meeting}, author={Franzon, Paul D and Rotenberg, Eric and Tuck, James and Davis, W Rhett and Zhou, Huiyang and Schabel, Joshua and Zhang, Zhenquian and Park, J and Dwiel, Brandon and Forbes, Elliott and et al.}, year={2013}, pages={29–4} } @article{lee_tuck_2013, title={Automatic Parallelization of Fine-Grained Metafunctions on a Chip Multiprocessor}, volume={10}, ISSN={["1544-3973"]}, DOI={10.1145/2541228.2541237}, abstractNote={Due to the importance of reliability and security, prior studies have proposed inlining metafunctions into applications for detecting bugs and security vulnerabilities. However, because these software techniques add frequent, fine-grained instrumentation to programs, they often incur large runtime overheads. In this work, we consider an automatic thread extraction technique for removing these fine-grained checks from a main application and scheduling them on helper threads. In this way, we can leverage the resources available on a CMP to reduce the latency and overhead of fine-grained checking codes.}, number={4}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, publisher={ACM New York, NY, USA}, author={Lee, Sanghoon and Tuck, James}, year={2013}, month={Dec} } @article{sheikh_tuck_rotenberg_2012, title={Control-Flow Decoupling}, ISBN={["978-1-4673-4819-5"]}, ISSN={["1072-4451"]}, DOI={10.1109/micro.2012.38}, abstractNote={Mobile and PC/server class processor companies continue to roll out flagship core micro architectures that are faster than their predecessors. Meanwhile placing more cores on a chip coupled with constant supply voltage puts per-core energy consumption at a premium. 
Hence, the challenge is to find future micro architecture optimizations that not only increase performance but also conserve energy. Eliminating branch mispredictions -- which waste both time and energy -- is valuable in this respect. We first explore the control-flow landscape by characterizing mispredictions in four benchmark suites. We find that a third of mispredictions-per-1K-instructions (MPKI) come from what we call separable branches: branches with large control-dependent regions (not suitable for if-conversion), whose backward slices do not depend on their control-dependent instructions or have only a short dependence. We propose control-flow decoupling (CFD) to eradicate mispredictions of separable branches. The idea is to separate the loop containing the branch into two loops: the first contains only the branch's predicate computation and the second contains the branch and its control-dependent instructions. The first loop communicates branch outcomes to the second loop through an architectural queue. Micro architecturally, the queue resides in the fetch unit to drive timely, non-speculative fetching or skipping of successive dynamic instances of the control-dependent region. Either the programmer or compiler can transform a loop for CFD, and we evaluate both. On a micro architecture configured similar to Intel's Sandy Bridge core, CFD increases performance by up to 43%, and reduces energy consumption by up to 41%. 
Moreover, for some applications, CFD is a necessary catalyst for future complexity-effective large-window architectures to tolerate memory latency.}, journal={2012 IEEE/ACM 45TH INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO-45)}, author={Sheikh, Rami and Tuck, James and Rotenberg, Eric}, year={2012}, pages={329–340} } @inproceedings{sheikh_tuck_rotenberg_2012, title={Control-flow decoupling}, booktitle={2012 45th Annual IEEE/ACM International Symposium on Microarchitecture}, author={Sheikh, Rami and Tuck, James and Rotenberg, Eric}, year={2012}, pages={329–340} } @inproceedings{vanka_tuck_2012, title={Efficient and accurate data dependence profiling using software signatures}, booktitle={Proceedings of the Tenth International Symposium on Code Generation and Optimization}, author={Vanka, Rajeshwar and Tuck, James}, year={2012}, pages={186–195} } @article{patsilaras_choudhary_tuck_2012, title={Efficiently Exploiting Memory Level Parallelism on Asymmetric Coupled Cores in the Dark Silicon Era}, volume={8}, ISSN={["1544-3566"]}, DOI={10.1145/2086696.2086707}, abstractNote={Extracting high memory-level parallelism (MLP) is essential for speeding up single-threaded applications which are memory bound. At the same time, the projected amount of dark silicon (the fraction of the chip powered off) on a chip is growing. Hence, Asymmetric Multicore Processors (AMP) offer a unique opportunity to integrate many types of cores, each powered at different times, in order to optimize for different regions of execution. In this work, we quantify the potential for exploiting core customization to speedup programs during regions of high MLP. Based on a careful design space exploration, we discover that an AMP that includes a narrow and fast specialized core has the potential to efficiently exploit MLP.}, number={4}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, publisher={ACM New York, NY, USA}, author={Patsilaras, George and Choudhary, Niket K. 
and Tuck, James}, year={2012}, month={Jan} } @inproceedings{han_jiang_liu_wu_tuck_2012, title={HiRe: using hint & release to improve synchronization of speculative threads}, booktitle={Proceedings of the 26th ACM international conference on Supercomputing}, author={Han, Liang and Jiang, Xiaowei and Liu, Wei and Wu, Youfeng and Tuck, James}, year={2012}, pages={143–152} } @article{patsilaras_choudhary_tuck_2011, title={Article 28 (21 pages)-Efficiently Exploiting Memory Level Parallelism on Asymmetric Coupled Cores in the Dark Silicon Era}, volume={8}, number={4}, journal={ACM Transactions on Architecture and Code Optimization-TACO}, author={Patsilaras, G and Choudhary, NK and Tuck, J}, year={2011} } @inproceedings{lee_danis_tuck_2011, title={AutoPipe: A Pipeline Parallelization Framework in GCC}, booktitle={GROW2011: International Workshop on GCC Research Opportunities}, author={Lee, S. and Danis, J. and Tuck, J.}, year={2011} } @inproceedings{lee_tuck_2011, title={Automatic Parallelization of Fine-grained Meta-functions on a Chip Multiprocessor}, DOI={10.1109/cgo.2011.5764681}, abstractNote={Due to the importance of reliability and security, prior studies have proposed inlining meta-functions into applications for detecting bugs and security vulnerabilities. However, because these software techniques add frequent, finegrained instrumentation to programs, they often incur large runtime overheads. In this work, we consider an automatic thread extraction technique for removing these fine-grained checks from a main application and scheduling them on helper threads. In this way, we can leverage the resources available on a CMP to reduce the latency and overhead of fine-grained checking codes. Our parallelization strategy automatically extracts meta-functions from the main application and executes them in customized helper threads — threads constructed to mirror relevant fragments of the main program's behavior in order to keep communication and overhead low. 
To get good performance, we consider optimizations that reduce communication and balance work among many threads. We evaluate our parallelization strategy on Mudflap, a pointer-use checking tool in GCC. To show the benefits of our technique, we compare it to a manually parallelized version of Mudflap. We run our experiments on an architectural simulator with support for fast queueing operations. On a subset of SPECint 2000, our automatically parallelized code is only 29% slower, on average, than the manually parallelized version on a simulated 8-core system. Furthermore, two applications achieve better speedups using our algorithms than with the manual approach. Also, our approach introduces very little overhead in the main program — it is kept under 100%, which is more than a 5.3× reduction compared to serial Mudflap.}, booktitle={International Symposium on Code Generation and Optimization}, author={Lee, S. and Tuck, James}, year={2011}, pages={130–140} } @inproceedings{lee_tiwari_yan_tuck_2011, title={HAQu: Hardware-accelerated queueing for fine-grained threading on a chip multiprocessor}, DOI={10.1109/hpca.2011.5749720}, abstractNote={Queues are commonly used in multithreaded programs for synchronization and communication. However, because software queues tend to be too expensive to support finegrained parallelism, hardware queues have been proposed to reduce overhead of communication between cores. Hardware queues require modifications to the processor core and need a custom interconnect. They also pose difficulties for the operating system because their state must be preserved across context switches. To solve these problems, we propose a hardware-accelerated queue, or HAQu. HAQu adds hardware to a CMP that accelerates operations on software queues. Our design implements fast queueing through an application's address space with operations that are compatible with a fully software queue. 
Our design provides accelerated and OS-transparent performance in three general ways: (1) it provides a single instruction for enqueueing and dequeueing which significantly reduces the overhead when used in fine-grained threading; (2) operations on the queue are designed to leverage low-level details of the coherence protocol; and (3) hardware ensures that the full state of the queue is stored in the application's address space, thereby ensuring virtualization. We have evaluated our design in the context of application domains: offloading fine-grained checks for improved software reliability, and automatic, fine-grained parallelization using decoupled software pipelining.}, booktitle={International symposium on high-performance computer}, author={Lee, S. and Tiwari, D. and Yan, S. H. and Tuck, J.}, year={2011}, pages={99–110} } @inproceedings{lee_tiwari_solihin_tuck_2011, title={HAQu: Hardware-accelerated queueing for fine-grained threading on a chip multiprocessor}, booktitle={2011 IEEE 17th International Symposium on High Performance Computer Architecture}, author={Lee, Sanghoon and Tiwari, Devesh and Solihin, Yan and Tuck, James}, year={2011}, pages={99–110} } @article{stenström_de bosschere_albericio_gran_ibáñez_viñals_llaberı́a jm_bayrak_velickovic_ienne_et al._2011, title={SPECIAL ISSUE ON HIGH-PERFORMANCE AND EMBEDDED ARCHITECTURES AND COMPILERS}, volume={8}, number={4}, journal={ACM Transactions on}, author={Stenström, P and De Bosschere, K and Albericio, J and Gran, R and Ibáñez, P and Viñals, V and LLaberı́a JM and Bayrak, AG and Velickovic, N and Ienne, P and et al.}, year={2011} } @inproceedings{patsilaras_choudhary_tuck_2010, title={Design Trade-offs for Memory Level Parallelism on an Asymmetric Multicore System}, booktitle={Pespma 2010-Workshop on Parallel Execution of Sequential Programs on Multi-core Architecture}, author={Patsilaras, George and Choudhary, Niket K and Tuck, James}, year={2010} } @inproceedings{patsilaras_choudhary_tuck_2010, 
title={Design Tradeoffs for Memory-Level Parallelism on an Asymmetric Multicore System}, booktitle={Workshop on Parallel Execution of Sequential Programs on Multi-core Architectures}, author={Patsilaras, G. and Choudhary, N. and Tuck, J.}, year={2010} } @inproceedings{tiwari_tuck_solihin_2010, title={MMT: Exploiting Fine Grained Parallelism in Dynamic Memory Management}, DOI={10.1109/ipdps.2010.5470428}, abstractNote={Dynamic memory management is one of the most expensive but ubiquitous operations in many operations in many C/C++ applications. Additional features such as security checks, while desirable, further worsen memory management overheads. With advent of multicore architecture, it is important to investigate how dynamic memory management overheads for sequential applications can be reduced. In this paper, we propose a new approach for accelerating dynamic memory management on multicore architecture, by offloading dynamic management functions to a separate thread that we refer to as memory management thread (MMT). We show that an efficient MMT design can give significant performance improvement by extracting parallelism while being agnostic to the underlying memory management library algorithms and data structures. We also show how parallelism provided by MMT can be beneficial for high overhead memory management tasks, for example, security checks related to memory management. We evaluate MMT on heap allocation-intensive benchmarks running on an Intel core 2 quad platform for two widely-used memory allocators: Doug Lea's and PHKmalloc allocators. On average, MMT achieves a speedup ratio of 1.19× for both allocators, while both the application and memory management libraries are unmodified and are oblivious to the parallelization scheme. For PHKmalloc with security checks turned on, MMT reduces the security check overheads from 21% to only 1% on average.}, booktitle={International Parallel and Distributed Processing Symposium}, author={Tiwari, D. and Tuck, J. 
and Solihin, Y.}, year={2010} } @inproceedings{tiwari_lee_tuck_solihin_2010, title={Mmt: Exploiting fine-grained parallelism in dynamic memory management}, booktitle={2010 IEEE International Symposium on Parallel & Distributed Processing (IPDPS)}, author={Tiwari, Devesh and Lee, Sanghoon and Tuck, James and Solihin, Yan}, year={2010}, pages={1–12} } @inproceedings{han_liu_tuck_2010, title={Speculative parallelization of partial reduction variables}, DOI={10.1145/1772954.1772975}, abstractNote={Reduction variables are an important class of cross-thread dependence that can be parallelized by exploiting the associativity and commutativity of their operation. In this paper, we define a class of shared variables called partial reduction variables (PRV). These variables either cannot be proven to be reductions or they violate the requirements of a reduction variable in some way. We describe an algorithm that allows the compiler to detect PRVs, and we also discuss the necessary requirements to parallelize detected PRVs. Based on these requirements, we propose an implementation in a TLS system to parallelize PRVs that works by a combination of techniques at compile time and in the hardware. The compiler transforms the variable under the assumption that the reduction-like behavior proven statically will hold true at runtime. However, if a thread reads or updates the shared variable as a result of an alias or unlikely control path, a lightweight hardware mechanism will detect the access and synchronize it to ensure correct execution. 
We implement our compiler analysis and transformation in GCC, and analyze its potential on the SPEC CPU 2000 benchmarks.We find that supporting PRVs provides up to 46% performance gain over a highly optimized TLS system and on average 10.7% performance improvement.}, booktitle={Proceedings of the 8th annual IEEE/ACM international symposium on Code generation and optimization}, author={Han, Liang and Liu, Wei and Tuck, James M}, year={2010}, pages={141–150} } @inproceedings{tiwari_lee_tuck_solihin_2009, title={Memory management thread for heap allocation intensive sequential applications}, DOI={10.1145/1621960.1621967}, abstractNote={Dynamic memory management is one of the most ubiquitous and expensive operations in many C/C++ applications. Some C/C++ programs might spend up to one third of their execution time in dynamic memory management routines. With multicore processors as a mainstream architecture, it is important to investigate how dynamic memory management can exploit the multi-core parallelism for speeding up sequential programs. In this paper, we propose a way for exploiting multicore parallelism in dynamic memory management for sequential applications, by spinning off memory allocation and deallocation functions to a separate thread that we refer to as memory management thread (MMT). The goal of this study is to show how an efficient design and implementation of MMT can give performance without any algorithm or implementation level knowledge of underlying memory management library being offloaded. Using heap allocation-intensive benchmarks, we evaluate MMT on an Intel Core 2 Quad platform for widely used Doug Lea's memory allocator. 
Without any modifications to application source-code or memory management algorithm of underlying memory allocators, our MMT approach achieves an average speedup ratio of 1.19x, and 1.60x in the best case.}, booktitle={Proceedings of the 10th workshop on MEmory performance: DEaling with Applications, systems and architecture}, author={Tiwari, Devesh and Lee, Sanghoon and Tuck, James and Solihin, Yan}, year={2009}, pages={35–42} } @article{tuck_ahn_torrellas_ceze_2009, title={SOFTSIG: SOFTWARE-EXPOSED HARDWARE SIGNATURES FOR CODE ANALYSIS AND OPTIMIZATION}, volume={29}, ISSN={["1937-4143"]}, DOI={10.1109/MM.2009.15}, abstractNote={Many code analysis techniques for optimization, debugging, and parallelization must perform runtime disambiguation of address sets. Hardware signatures support such operations efficiently and with low complexity. SoftSig exposes hardware signatures to software through instructions that control which addresses to collect and which to disambiguate against. The Memoise algorithm demonstrates SoftSig's versatility by detecting and eliminating redundant function calls.}, number={1}, journal={IEEE MICRO}, publisher={IEEE}, author={Tuck, James and Ahn, Wonsun and Torrellas, Josep and Ceze, Luis}, year={2009}, pages={84–95} } @article{torrellas_ceze_tuck_cascaval_montesinos_ahn_prvulovic_2009, title={The Bulk Multicore Architecture for Improved Programmability}, volume={52}, ISSN={["1557-7317"]}, DOI={10.1145/1610252.1610271}, abstractNote={Easing the programmer's burden does not compromise system performance or increase the complexity of hardware implementation.}, number={12}, journal={COMMUNICATIONS OF THE ACM}, publisher={ACM New York, NY, USA}, author={Torrellas, Josep and Ceze, Luis and Tuck, James and Cascaval, Calin and Montesinos, Pablo and Ahn, Wonsun and Prvulovic, Milos}, year={2009}, month={Dec}, pages={58–65} } @book{gopal_tuck_2008, title={A Data Dependence Profiler for the GNU Compiler Collection}, journal={Technical Report- Not 
held in TRLN member libraries}, author={Gopal, S. and Tuck, J.}, year={2008} } @inproceedings{lee_tuck_2008, title={Parallelizing Mudflap Using Thread-Level Speculation on a CMP}, booktitle={Workshop on Parallel Execution of Sequential Programs on Multi-core Architectures}, author={Lee, S. and Tuck, J.}, year={2008} } @inproceedings{lee_tuck_2008a, title={Parallelizing Mudflap using Thread-Level Speculation on a Chip Multiprocessor}, booktitle={Proc. of the 2008 Workshop on Parallel Execution of Sequential Programs on Multicore Architectures}, author={Lee, Sanghoon and Tuck, James}, year={2008}, pages={72–80} } @inproceedings{tuck_ahn_ceze_torrellas_2008, title={SoftSig: Software Exposed Hardware Signatures for Code Analysis and Optimization}, DOI={10.1145/1346281.1346300}, abstractNote={Many code analysis techniques for optimization, debugging, and parallelization must perform runtime disambiguation of address sets. Hardware signatures support such operations efficiently and with low complexity. SoftSig exposes hardware signatures to software through instructions that control which addresses to collect and which to disambiguate against. The Memoise algorithm demonstrates SoftSig's versatility by detecting and eliminating redundant function calls.}, booktitle={ACM International Symposium on Architectural Support for Programming Languages and Operating Systems}, author={Tuck, J. and Ahn, W. and Ceze, L. 
and Torrellas, J.}, year={2008}, pages={145–156} } @article{tuck_ahn_ceze_torrellas_2008a, title={SoftSig: software-exposed hardware signatures for code analysis and optimization}, volume={42}, number={2}, journal={ACM SIGOPS Operating Systems Review}, publisher={ACM New York, NY, USA}, author={Tuck, James and Ahn, Wonsun and Ceze, Luis and Torrellas, Josep}, year={2008}, pages={145–156} } @misc{system_and_method_for_cache_coherency_2008, title={System and method for cache coherency in a cache with different cache location lengths}, note={US Patent 7,454,576}, year={2008}, month={Nov} } @inproceedings{ceze_tuck_montesinos_torrellas_2007, title={BulkSC: Bulk enforcement of sequential consistency}, DOI={10.1145/1250662.1250697}, abstractNote={While Sequential Consistency (SC) is the most intuitive memory consistency model and the one most programmers likely assume, current multiprocessors do not support it. Instead, they support more relaxed models that deliver high performance. SC implementations are considered either too slow or -- when they can match the performance of relaxed models -- too difficult to implement. In this paper, we propose Bulk Enforcement of SC (BulkSC), a novel way of providing SC that is simple to implement and offers performance comparable to Release Consistency (RC). The idea is to dynamically group sets of consecutive instructions into chunks that appear to execute atomically and in isolation. The hardware enforces SC at the coarse grain of chunks which, to the program, appears as providing SC at the individual memory access level. BulkSC keeps the implementation simple by largely decoupling memory consistency enforcement from processor structures. Moreover, it delivers high performance by enabling full memory access reordering and overlapping within chunks and across chunks. 
We describe a complete system architecture that supports BulkSC and show that it delivers performance comparable to RC.}, booktitle={Proceedings of the 34th annual international symposium on Computer architecture}, author={Ceze, Luis and Tuck, James and Montesinos, Pablo and Torrellas, Josep}, year={2007}, pages={278–289} } @inproceedings{tuck_liu_torrellas_2007, title={CAP: Criticality analysis for power-efficient speculative multithreading}, DOI={10.1109/iccd.2007.4601932}, abstractNote={While speculative multithreading (SM) on a chip multiprocessor (CMP) has the ability to speed-up hard-to- parallelize applications, the power inefficiency of aggressive speculation is a concern. To improve SMs power effeciency, we note that not all the tasks that are running in a SM environment are equally critical. To leverage this insight, this paper develops a novel, widely-applicable task-criticality model for SM. It also proposes CAP, a novel architecture that builds a task-criticality graph dynamically and uses it to make scheduling decisions in a SM CMP. Experiments with SPECint, SPECfp, and Olden applications show that, in a CMP with one fast core and three slow ones, the E D2 with CAP is, on average, 91-95% of that without. Moreover, it is only 77-91% of the E D2 of a CMP with four fast cores and no CAP. Overall, we argue that scheduling for task criticality is beneficial.}, booktitle={2007 25th International Conference on Computer Design}, author={Tuck, James and Liu, Wei and Torrellas, Josep}, year={2007}, pages={409–416} } @book{tuck_2007, title={Efficient support for speculative tasking}, author={Tuck, James M}, year={2007} } @inproceedings{ceze_tuck_torrellas_2006, title={Are We Ready for High Memory-Level Parallelism?}, booktitle={Workshop on Memory Performance Issues}, author={Ceze, L. and Tuck, J. 
and Torrellas, J.}, year={2006} } @inproceedings{ceze_tuck_torrellas_2006a, title={Are we ready for high memory-level parallelism?}, booktitle={4th Workshop on Memory Performance Issues}, author={Ceze, Luis and Tuck, James and Torrellas, Josep}, year={2006} } @article{wang_ji_hu_2006, title={Boosting SMT trace processors performance with data cache miss-sensitive thread scheduling mechanism}, volume={30}, number={5}, journal={Microprocessors and Microsystems}, publisher={Elsevier}, author={Wang, Kai-feng and Ji, Zhen-zhou and Hu, Ming-zeng}, year={2006}, pages={225–233} } @inproceedings{ceze_tuck_cascaval_torrellas_2006, title={Bulk Disambiguation of Speculative Threads in Multiprocessors}, booktitle={IEEE/ACM Annual International Symposium on Computer Architecture}, author={Ceze, L. and Tuck, J. and Cascaval, C. and Torrellas, J.}, year={2006}, pages={227–238} } @article{ceze_tuck_torrellas_cascaval_2006, title={Bulk disambiguation of speculative threads in multiprocessors}, volume={34}, number={2}, journal={ACM SIGARCH Computer Architecture News}, publisher={ACM New York, NY, USA}, author={Ceze, Luis and Tuck, James and Torrellas, Josep and Cascaval, Calin}, year={2006}, pages={227–238} } @article{ceze_strauss_tuck_torrellas_renau_2006, title={CAVA: Using checkpoint-assisted value prediction to hide L2 misses}, volume={3}, DOI={10.1145/1138035.1138038}, abstractNote={Modern superscalar processors often suffer long stalls because of load misses in on-chip L2 caches. To address this problem, we propose hiding L2 misses with Checkpoint-Assisted VAlue prediction (CAVA). On an L2 cache miss, a predicted value is returned to the processor. When the missing load finally reaches the head of the ROB, the processor checkpoints its state, retires the load, and speculatively uses the predicted value and continues execution. When the value in memory arrives at the L2 cache, it is compared to the predicted value. 
If the prediction was correct, speculation has succeeded and execution continues; otherwise, execution is rolled back and restarted from the checkpoint. CAVA uses fast checkpointing, speculative buffering, and a modest-sized value prediction structure that has about 50% accuracy. Compared to an aggressive superscalar processor, CAVA speeds up execution by up to 1.45 for SPECint applications and 1.58 for SPECfp applications, with a geometric mean of 1.14 for SPECint and 1.34 for SPECfp applications. We also evaluate an implementation of Runahead execution---a previously proposed scheme that does not perform value prediction and discards all work done between checkpoint and data reception from memory. Runahead execution speeds up execution by a geometric mean of 1.07 for SPECint and 1.18 for SPECfp applications, compared to the same baseline.}, number={2}, journal={ACM Transactions on Architecture and Code Optimization (TACO)}, publisher={ACM}, author={Ceze, Luis and Strauss, Karin and Tuck, James and Torrellas, Josep and Renau, Jose}, year={2006}, pages={182–208} } @article{nenau_strauss_ceze_liu_sarangi_tuck_torrellas_2006, title={Energy-Efficient Thread-Level Speculation on a CMP}, journal={IEEE Micro Special Issue: Micro's Top Picks from Computer Architecture Conferences}, author={Renau, J. and Strauss, K. and Ceze, L. and Liu, W. and Sarangi, S. and Tuck, J. 
and Torrellas, J.}, year={2006}, pages={80–91} } @article{renau_strauss_ceze_liu_sarangi_tuck_torrellas_2006, title={Energy-efficient thread-level speculation}, volume={26}, number={1}, journal={IEEE Micro}, publisher={IEEE}, author={Renau, Jose and Strauss, Karin and Ceze, Luis and Liu, Wei and Sarangi, Smruti R and Tuck, James and Torrellas, Josep}, year={2006}, pages={80–91} } @inproceedings{liu_tuck_ceze_ahn_strauss_renau_torrellas_2006, title={POSH: a TLS compiler that exploits program structure}, DOI={10.1145/1122971.1122997}, abstractNote={As multi-core architectures with Thread-Level Speculation (TLS) are becoming better understood, it is important to focus on TLS compilation. TLS compilers are interesting in that, while they do not need to fully prove the independence of concurrent tasks, they make choices of where and when to generate speculative tasks that are crucial to overall TLS performance.This paper presents POSH, a new, fully automated TLS compiler built on top of gcc. POSH is based on two design decisions. First, to partition the code into tasks, it leverages the code structures created by the programmer, namely subroutines and loops. Second, it uses a simple profiling pass to discard ineffective tasks. With the code generated by POSH, a simulated TLS chip multiprocessor with 4 superscalar cores delivers an average speedup of 1.30 for the SPECint 2000 applications. 
Moreover, an estimated 26% of this speedup is a result of the implicit data prefetching provided by squashed tasks.}, booktitle={Proceedings of the eleventh ACM SIGPLAN symposium on Principles and practice of parallel programming}, author={Liu, Wei and Tuck, James and Ceze, Luis and Ahn, Wonsun and Strauss, Karin and Renau, Jose and Torrellas, Josep}, year={2006}, pages={158–167} } @inproceedings{tuck_ceze_torrellas_2006, title={Scalable cache miss handling for high memory-level parallelism}, DOI={10.1109/micro.2006.44}, abstractNote={Recently-proposed processor microarchitectures for high memory level parallelism (MLP) promise substantial performance gains. Unfortunately, current cache hierarchies have miss-handling architectures (MHAs) that are too limited to support the required MLP - they need to be redesigned to support 1-2 orders of magnitude more outstanding misses. Yet, designing scalable MHAs is challenging: designs must minimize cache lock-up time and deliver high bandwidth while keeping the area consumption reasonable. This paper presents a novel scalable MHA design for high-MLP processors. Our design introduces two main innovations. First, it is hierarchical, with a small MSHR file per cache bank, and a larger MSHR file shared by all banks. Second, it uses a Bloom filter to reduce searches in the larger MSHR file. The result is a high-performance, area-efficient design. Compared to a state-of-the-art MHA on a high-MLP processor, our design speeds-up some SPECint, SPECfp, and multiprogrammed workloads by a geometric mean of 32%, 50%, and 95%, respectively. Moreover, compared to two extrapolations of current MHA designs, namely a large monolithic MSHR file and a large banked MSHR file, all consuming the same area, our design speeds-up the workloads by a geometric mean of 1-18% and 10-21%, respectively. 
Finally, our design performs very close to an unlimited-size, ideal MHA}, booktitle={2006 39th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO'06)}, author={Tuck, James and Ceze, Luis and Torrellas, Josep}, year={2006}, pages={409–422} } @article{hughes_tuck_lee_chen_2006, title={System and method for non-uniform cache in a multi-core processor}, note={US Patent App. 11/023,925}, author={Hughes, Christopher and Tuck, James and Lee, Victor and Chen, Yen-kuang}, year={2006}, month={Jun} } @article{tseng_2005, title={Languages and compilers for parallel computing}, author={Pugh, Bill and Tseng, Chau-Wen}, year={2005} } @inproceedings{liu_tuck_ceze_strauss_renau_torrellas_2005, title={POSH: A profiler-enhanced TLS compiler that leverages program structure}, booktitle={IBM Watson P=AC2 Conference}, author={Liu, Wei and Tuck, James and Ceze, Luis and Strauss, Karin and Renau, Jose and Torrellas, Josep}, year={2005}, pages={83–92} } @article{renau_fraguela_tuck_liu_prvulovic_ceze_sarangi_sack_strauss_montesinos_2005, title={SESC simulator, January 2005}, author={Renau, Jose and Fraguela, Basilio and Tuck, James and Liu, Wei and Prvulovic, Milos and Ceze, Luis and Sarangi, Smruti and Sack, Paul and Strauss, Karin and Montesinos, Pablo}, year={2005} } @inproceedings{renau_j._l._l._k._torrellas_2005, title={Tasking with Out-of-Order Spawn in TLS Chip Multiprocessors: Microarchitecture and Compilation}, DOI={10.1145/1088149.1088173}, abstractNote={Chip Multiprocessors (CMPs) are flexible, high-frequency platforms on which to support Thread-Level Speculation (TLS). However, for TLS to deliver on its promise, CMPs must exploit multiple sources of speculative task-level parallelism, including any nesting levels of both subroutines and loop iterations. 
Unfortunately, these environments are hard to support in decentralized CMP hardware: since tasks are spawned out-of-order and unpredictably, maintaining key TLS basics such as task ordering and efficient resource allocation is challenging. While the concept of out-of-order spawning is not new, this paper is the first to propose a set of microarchitectural mechanisms that, altogether, fundamentally enable fast TLS with out-of-order spawn in a CMP. Moreover, we develop a fully-automated TLS compiler for aggressive out-of-order spawn. With our mechanisms, a TLS CMP with four 4-issue cores achieves an average speedup of 1.30 for full SPECint 2000 applications; the corresponding speedup for in-order only spawn is 1.04. Overall, our mechanisms unlock the potential of TLS for the toughest applications.}, booktitle={ACM International Conference on Supercomputing}, author={Renau, Jose and Tuck, James and Liu, Wei and Ceze, Luis and Strauss, Karin and Torrellas, Josep}, year={2005}, pages={179–188} } @inproceedings{renau_tuck_liu_ceze_strauss_torrellas_2005, title={Tasking with out-of-order spawn in TLS chip multiprocessors: Microarchitecture and compilation}, booktitle={Proceedings of the 19th Annual International conference on Supercomputing}, author={Renau, Jose and Tuck, James and Liu, Wei and Ceze, Luis and Strauss, Karin and Torrellas, Josep}, year={2005}, pages={179–188} } @inproceedings{renau_strauss_ceze_liu_sarangi_tuck_torrellas_2005, title={Thread-level speculation on a CMP can be energy efficient}, DOI={10.1145/1088149.1088178}, abstractNote={Chip Multiprocessors (CMP) with Thread-Level Speculation (TLS) have become the subject of intense research. However, TLS is suspected of being too energy inefficient to compete against conventional processors. In this paper, we refute this claim. To do so, we first identify the main sources of dynamic energy consumption in TLS. 
Then, we present simple energy-saving optimizations that cut the energy cost of TLS by over 60% on average with minimal performance impact. The resulting TLS CMP, populated with four 3-issue cores, speeds-up full SPECint 2000 codes by 1.27 on average, while keeping the fraction of the chip's energy consumption due to TLS to only 20%. Compared to a 6-issue superscalar at the same frequency, the TLS CMP is on average faster, while consuming only 85% of its total on-chip power.}, booktitle={Proceedings of the 19th annual international conference on Supercomputing}, author={Renau, Jose and Strauss, Karin and Ceze, Luis and Liu, Wei and Sarangi, Smruti and Tuck, James and Torrellas, Josep}, year={2005}, pages={219–228} } @article{ceze_strauss_tuck_torrellas_2004, title={CAVA: Hiding L2 Misses with Checkpoint-Assisted Value Prediction}, journal={IEEE Computer Architecture Letters}, author={Ceze, L. and Strauss, K. and Tuck, J. and Torrellas, J.}, year={2004}, pages={7–10} } @article{ceze_strauss_tuck_renau_torrellas_2004, title={CAVA: Hiding L2 misses with checkpoint-assisted value prediction}, volume={3}, number={1}, journal={IEEE Computer Architecture Letters}, publisher={IEEE}, author={Ceze, Luis and Strauss, Karin and Tuck, James and Renau, Jose and Torrellas, Josep}, year={2004}, pages={7–7} } @phdthesis{tuck_2003, title={A novel compiler framework for a chip-multiprocessor architecture with thread-level speculation}, school={University of Illinois at Urbana-Champaign}, author={Tuck, James Murray}, year={2003} } @article{renau_tuck_liu_torrellas_2002, title={Morphable multithreaded memory tiles (M3T) architecture}, journal={University of Illinois UIUC-CS Technical Report}, author={Renau, Jose and Tuck, James and Liu, Wei and Torrellas, Josep}, year={2002} } @book{tuck_baugh_renau_torrellas_2002, title={Sphinx Parallelization}, author={Tuck, James M and Baugh, Lee W and Renau, Jose and Torrellas, Josep}, year={2002} } @article{baugh_renau_tuck_torrellas_2002, 
title={Sphinx parallelization}, journal={Dept. of Computer Science, University of Illinois, Tech. Rep. UIUCDCS}, author={Baugh, Lee and Renau, Jose and Tuck, James and Torrellas, Josep}, year={2002} } @article{gray_bapty_neema_tuck_2001, title={Handling crosscutting constraints in domain-specific modeling}, volume={44}, number={10}, journal={Communications of the ACM}, publisher={ACM New York, NY, USA}, author={Gray, Jeff and Bapty, Ted and Neema, Sandeep and Tuck, James}, year={2001}, pages={87–93} } @article{gray_bapty_neema_tuck_2001a, title={Handling crosscutting constraints in domain-specific modeling - Uniting AOP with model-integrated computing.}, volume={44}, ISSN={["0001-0782"]}, DOI={10.1145/383845.383864}, abstractNote={An Aspect-Oriented (AO) approach can be beneficial at different stages of the software lifecycle and at various levels of abstraction. Whenever the description of a software artifact exhibits crosscutting structure, the principles of modularity espoused by AO offer a powerful technology for supporting separation of concerns. 
We have found this to be true especially in the area of domain-specific modeling [3].}, number={10}, journal={COMMUNICATIONS OF THE ACM}, author={Gray, J and Bapty, T and Neema, S and Tuck, J}, year={2001}, month={Oct}, pages={87–93} } @article{tuck_bapty_2001, title={Institute for Software Integrated Systems Vanderbilt University Nashville Tennessee 37235}, volume={1}, journal={ISIS}, publisher={Citeseer}, author={Tuck, James and Bapty, Ted}, year={2001}, pages={200} } @article{patsilaras_lee_tuck, title={Abstract Parallel Operators: Revamping the Hardware/Software Interface for the Multicore Era}, author={Patsilaras, George and Lee, Sanghoon and Tuck, James} } @article{vanka_tuck, title={Improving MemoiSE Using Function Splitting}, publisher={Citeseer}, author={Vanka, Rajesh and Tuck, James} } @book{improving_memoise_via_function_splitting_2009, title={Improving MemoiSE via Function Splitting}, volume={2009}, journal={Technical Report- Not held in TRLN member libraries} } @article{akin_baghsorkhi_bai_fletcher_healy_huang_kayiran_khan_li_litz_etal, title={Industry Session Program Committee}, author={Akin, Berkin and Baghsorkhi, Sara and Bai, Yuxin and Fletcher, Chris and Healy, Michael and Huang, Michael and Kayiran, Onur and Khan, Samira and Li, Helen and Litz, Heiner and others} } @article{tuck_torrellas, title={Tasking with out-of-order spawn in TLS chip multiprocessors}, publisher={Citeseer}, author={Tuck, James and Torrellas, Josep} } @article{strauss_ceze_liu_sarangi_tuck_torrellas, title={Thread-Level Speculation on a CMP Can Be Energy Efficient}, publisher={Citeseer}, author={Renau, Jose and Strauss, Karin and Ceze, Luis and Liu, Wei and Sarangi, Smruti and Tuck, James and Torrellas, Josep} }