% Conference-paper records. All five venues are symposium/conference
% proceedings, so each record is typed @inproceedings and the venue name lives
% in `booktitle`. Citation keys are unchanged.
% Conventions used below:
%   - acronyms and proper names in titles/venues are brace-protected so that
%     sentence-casing styles do not lowercase them;
%   - page ranges use `--`;
%   - `%` inside abstracts is written `\%` so the text is safe if typeset;
%   - multiplication signs are written `$...\times$`.

% Salus -- HPCA 2024.
@inproceedings{abdullah_lee_zhou_awad_2024,
  author    = {Abdullah, Rahaf and Lee, Hyokeun and Zhou, Huiyang and Awad, Amro},
  title     = {{Salus}: Efficient Security Support for {CXL}-Expanded {GPU} Memory},
  booktitle = {2024 {IEEE} International Symposium on High-Performance Computer Architecture, {HPCA} 2024},
  year      = {2024},
  pages     = {233--248},
  isbn      = {979-8-3503-9314-9},
  issn      = {1530-0897},
  doi       = {10.1109/HPCA57654.2024.00027},
  abstract  = {GPUs have become indispensable accelerators for many data-intensive applications such as scientific workloads, deep learning models, and graph analytics; these applications share a common demand for increasingly large memory. As the memory capacity connected through traditional memory interfaces is reaching limits, heterogeneous memory systems have gained traction in expanding the memory pool. These systems involve dynamic data movement between different memory locations for efficient utilization, which poses challenges for existing security implementations, whose metadata are tied to the physical location of data. In this work, we propose a new security model specifically designed for systems with dynamic page migration. Our model minimizes the need for security recalculations due to data movement, optimizes security structures for efficient bandwidth utilization, and reduces the overall traffic caused by security operations. Based on our evaluation, our proposed security support improves the GPU throughput by a geometric mean of 29.94\% (up to 190.43\%) over the conventional security model, and it reduces the security traffic in the memory subsystem to 47.79\% on average (as low as 17.71\% overhead).},
}

% Spatio-temporal prefetcher -- ISOCC 2023.
@inproceedings{jang_kim_lee_2023,
  author    = {Jang, Jihoon and Kim, Hyun and Lee, Hyokeun},
  title     = {A Spatio-Temporal Switchable Data Prefetcher for Convolutional Neural Networks},
  booktitle = {2023 20th International {SoC} Design Conference, {ISOCC}},
  year      = {2023},
  pages     = {141--142},
  issn      = {2163-9612},
  doi       = {10.1109/ISOCC59558.2023.10396344},
  abstract  = {In this paper, we propose a spatio-temporal switchable data prefetcher that can adapt to the locality characteristics of CNN models. The proposed prefetcher records the recent delta history by leveraging two tables. The first table predicts spatial address patterns by comparing the delta score with the last delta, while the second table predicts temporal address patterns by recording and reordering the delta sequence from the delta history. Consequently, the proposed prefetcher is capable of appropriately switching between these two prediction methodologies based on spatial and temporal localities. The experimental results on CNN inference workloads show that we achieved high average accuracy of 83.8\% and coverage of 81.6\%, and hence the proposed prefetcher improves system performance by 33.8\% over a baseline with no data prefetcher and 21\% over the best-performing prior spatio-temporal prefetcher.},
}

% CryptoMMU -- MICRO 2023. The ACM CCS-concepts trailer that the export had
% fused onto the end of the abstract is kept separately in `keywords`.
@inproceedings{alam_lee_bhattacharjee_awad_2023,
  author    = {Alam, Faiz and Lee, Hyokeun and Bhattacharjee, Abhishek and Awad, Amro},
  title     = {{CryptoMMU}: Enabling Scalable and Secure Access Control of Third-Party Accelerators},
  booktitle = {56th {IEEE/ACM} International Symposium on Microarchitecture, {MICRO} 2023},
  year      = {2023},
  pages     = {32--48},
  doi       = {10.1145/3613424.3614311},
  keywords  = {Security and privacy $\rightarrow$ Security in hardware, Hardware $\rightarrow$ Very large scale integration design},
  abstract  = {Due to increasing energy and performance gaps between general-purpose processors and hardware accelerators (e.g., FPGA or ASIC), clear trends for leveraging accelerators arise in various fields or workloads, such as edge devices, cloud systems, and data centers. Moreover, system integrators desire higher flexibility to deploy custom accelerators based on their performance, power, and cost constraints, where such integration can be as early as (1) at the design time when third-party intellectual properties (IPs) are used, (2) at integration/upgrade time when third-party discrete chip accelerators are used, or (3) during runtime as in reconfigurable logic. A malicious third-party accelerator can compromise the entire system by accessing other processes' data, overwriting OS data structures, etc. To eliminate these security ramifications, a unit similar to a memory management unit (MMU), namely IOMMU, is typically used to scrutinize memory accesses from I/O devices, including accelerators. Still, IOMMU incurs significant performance overhead because it resides on the critical path of each I/O memory access. In this paper, we propose a novel scheme, CryptoMMU, to delegate the translation processes to accelerators, whereas the authentication of the targeted address is elegantly performed using a cryptography-based approach. As a result, CryptoMMU facilitates the private caching of translation in each accelerator, providing better scalability. Our evaluation results show that CryptoMMU improves system throughput by an average of $2.97\times$ and $1.13\times$ compared to the conventional IOMMU and the state-of-the-art solution, respectively. Importantly, CryptoMMU can be implemented without any software changes.},
}

% ERAS -- IISWC 2023. NOTE(review): the exported abstract ended with the bare
% anchor text ": Link" (the hyperlink target was not part of the export); the
% placeholder word is dropped here. Add a `url` field if the repository
% address is known.
@inproceedings{nema_chunduru_kodigal_voskuilen_rodrigues_hemmert_feinberg_lee_awad_hughes_2023,
  author    = {Nema, Shubham and Chunduru, Shiva Kaushik and Kodigal, Charan and Voskuilen, Gwendolyn and Rodrigues, Arun F. and Hemmert, Scott and Feinberg, Ben and Lee, Hyokeun and Awad, Amro and Hughes, Clayton},
  title     = {{ERAS}: A Flexible and Scalable Framework for Seamless Integration of {RTL} Models with {Structural Simulation Toolkit}},
  booktitle = {2023 {IEEE} International Symposium on Workload Characterization, {IISWC}},
  year      = {2023},
  pages     = {196--200},
  doi       = {10.1109/IISWC59245.2023.00038},
  abstract  = {The prevalence of custom Intellectual Properties (IPs) poses challenges for assessing their system-level performance and functional behavior. Register Transfer Level (RTL) simulation requires RTL-level integration with the rest of the system which is time- and resource-intensive. Similarly, developing functional and performance models of the IP requires considerable effort and expertise. This work proposes a framework, ERAS, that enables seamless integration of RTL IP models with high-level architectural simulators, such as Structural Simulation Toolkit (SST). The effectiveness of this framework is demonstrated through architectural exploration using a RISCV processor. Further, ERAS leverages SST's multi-thread support to enhance simulation speed, effectively overcoming a key bottleneck of detailed RTL simulation. Evaluation with a dual-core RISCV-RTL configuration shows $1.83\times$ simulation speed improvement compared to a serial simulation in gem5 as baseline. ERAS is now part of SST public repository.},
}

% SDM -- PACT 2023.
@inproceedings{lee_choi_lee_sim_2023,
  author    = {Lee, Hyokeun and Choi, Kwanseok and Lee, Hyuk-Jae and Sim, Jaewoong},
  title     = {{SDM}: Sharing-enabled Disaggregated Memory System with Cache Coherent {Compute Express Link}},
  booktitle = {2023 32nd International Conference on Parallel Architectures and Compilation Techniques, {PACT}},
  year      = {2023},
  pages     = {86--98},
  issn      = {1089-795X},
  doi       = {10.1109/PACT58117.2023.00016},
  abstract  = {Disaggregated memory has been gaining significant traction as a promising solution for scaling memory capacity and better utilizing memory resources in data centers. However, a disaggregated memory system that can simultaneously achieve high performance and user transparency is still not available. Although some modern interconnect technologies now feature hardware coherence protocols that can potentially enable data sharing among multiple computing nodes in a user-transparent manner, naively applying these technologies to disaggregated memory systems results in non-negligible performance overheads. In this work, we propose SDM, a sharing-enabled, cache-coherent disaggregated memory system that effectively utilizes modern interconnect technology. The key design principle of SDM is to implement a novel, dedicated control flow that efficiently enables data sharing among multiple computing nodes without the need to modify user applications, by leveraging the message types defined in the modern memory expansion standard, Compute Express Link (CXL). We also introduce resource management and speculative memory access mechanisms that do not interfere with normal memory transaction channels, thereby further improving the performance of disaggregated memory systems. We evaluate our design based on an in-house simulation framework with detailed analytical models that mimic a cache-coherent multi-node disaggregated memory system. The results show that SDM outperforms the optimized baseline system, which is similar to the one employing CXL 3.0, by $5.77\times$ and $2.65\times$ for two distinctive benchmark suites.},
}