@article{yuan_awad_yudha_solihin_zhou_2022, title={Adaptive Security Support for Heterogeneous Memory on GPUs}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA53966.2022.00024}, abstractNote={The wide use of accelerators such as GPUs necessities their security support Recent works [17], [33], [34] pointed out that directly adopting the CPU secure memory design to GPUs could incur significant performance overheads due to the memory bandwidth contention between regular data and security metadata. In this paper, we analyze the security guarantees that used to defend against physical attacks, and make the observation that heterogeneous GPU memory system may not always need all the security mechanisms to achieve the security guarantees. Based on the memory types as well as memory access patterns either explicitly specified in the GPU programming model or implicitly detected at run time, we propose adaptive security memory support for heterogeneous memory on GPUs. Specifically, we first identify the read-only data and propose to only use MAC (Message Authentication Code) to protect their integrity. By eliminating the freshness checks on read-only data, we can use an on-chip shared counter for such data regions and remove the corresponding parts in the Bonsai Merkel Tree (BMT), thereby reducing the traffic due to encryption counters and the BMT. Second, we detect the common streaming data access pattern and propose coarse- grain MACs for such stream data to reduce the MAC access bandwidth. With the hardware-based detection of memory type (read-only or not) and memory access patterns (streaming or not), our proposed approach adapts the security support to significantly reduce the performance overhead without sacrificing the security guarantees. Our evaluation shows that our scheme can achieve secure memory on GPUs with low overheads for memory-intensive workloads. Among the fifteen memory-intensive workloads in our evaluation, our design reduces the performance overheads of secure GPU memory from 53.9% to 8.09% on average. Compared to the state-of- the-art secure memory designs for GPU [17], [33], our scheme outperforms PSSM by up to 41.63% and 9.5% on average and outperforms Common counters by 84.04% on average for memory-intensive workloads. We further propose to use the L2 cache as a victim cache for security metadata when the L2 is either underutilized or suffers from very high miss rates, which further reduces the overheads by up to 4% and 0.65% on average.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Yuan, Shougang and Awad, Amro and Yudha, Ardhi Wiratama Baskara and Solihin, Yan and Zhou, Huiyang}, year={2022}, pages={213–228} } @article{yudha_meyer_yuan_zhou_solihin_2022, title={LITE: A Low-Cost Practical Inter-Operable GPU TEE}, DOI={10.1145/3524059.3532361}, abstractNote={There is a strong need for GPU trusted execution environments (TEEs) as GPU is increasingly used in the cloud environment. However, current proposals either ignore memory security (i.e., not encrypting memory) or impose a separate memory encryption domain from the host TEE, causing a very substantial slowdown for communicating data from/to the host. In this paper, we propose a flexible GPU memory encryption design called LITE that relies on software memory encryption aided by small architecture support. LITE's flexibility allows GPU TEE to be co-designed with CPU to create a unified encryption domain. We show that GPU applications can be adapted to the use of LITE encryption APIs without major changes. Through various optimizations, we show that software memory encryption in LITE can produce negligible performance overheads (1.1%) for regular benchmarks and still-acceptable overheads (56%) for irregular benchmarks.}, journal={PROCEEDINGS OF THE 36TH ACM INTERNATIONAL CONFERENCE ON SUPERCOMPUTING, ICS 2022}, author={Yudha, Ardhi Wiratama Baskara and Meyer, Jake and Yuan, Shougang and Zhou, Huiyang and Solihin, Yan}, year={2022} } @article{yuan_yudha_solihin_zhou_2021, title={Analyzing Secure Memory Architecture for GPUs}, DOI={10.1109/ISPASS51385.2021.00017}, abstractNote={Wide adoption of cloud computing makes privacy and security a primary concern. Although recent CPUs have integrated secure memory architecture, such support is still missing for GPUs, a key accelerator in data centers. In this paper, we explore two secure memory architectures, counter-mode encryption and direct encryption, for GPUs and show that we need to architect secure memory differently from it for CPUs. Our in-depth study reveals the following insights. First, as GPUs are designed for high-throughput computation, its secure memory needs to deliver high bandwidth. Second, with counter-mode encryption, the memory traffic resulting from the metadata, i.e., the counters, MACs (message-authentication codes), and integrity tree, may cause significant performance degradation, even in the presence of metadata caches. Third, the sectored cache structure adopted by GPUs leads to multiple sequential accesses to the same metadata cache line, which necessitates the use of MSHRs (miss-status handling registers) for meta-data caches. Fourth, unlike CPUs, separate/partitioned metadata caches perform better than unified metadata caches on GPUs. The reason is that GPU workloads feature streaming accesses, which cause severe contention in the unified metadata cache and the cached counters and integrity tree nodes may be evicted before being reused. Fifth, the massive-threaded nature of GPUs make them latency-tolerant and the performance impact due to the extra encryption/decryption latency is limited. As a result, direct encryption can be a promising alternative for GPU secure memory. The challenge, however, lies in memory integrity verification as the integrity tree may incur high storage overhead and metadata traffic.}, journal={2021 IEEE INTERNATIONAL SYMPOSIUM ON PERFORMANCE ANALYSIS OF SYSTEMS AND SOFTWARE (ISPASS 2021)}, author={Yuan, Shougang and Yudha, Ardhi Wiratama Baskara and Solihin, Yan and Zhou, Huiyang}, year={2021}, pages={59–69} }