@article{abdullah_lee_zhou_awad_2024, title={Salus: Efficient Security Support for CXL-Expanded GPU Memory}, ISBN={979-8-3503-9314-9}, ISSN={1530-0897}, DOI={10.1109/HPCA57654.2024.00027}, abstractNote={GPUs have become indispensable accelerators for many data-intensive applications such as scientific workloads, deep learning models, and graph analytics; these applications share a common demand for increasingly large memory. As the memory capacity connected through traditional memory interfaces is reaching limits, heterogeneous memory systems have gained traction in expanding the memory pool. These systems involve dynamic data movement between different memory locations for efficient utilization, which poses challenges for existing security implementations, whose metadata are tied to the physical location of data. In this work, we propose a new security model specifically designed for systems with dynamic page migration. Our model minimizes the need for security recalculations due to data movement, optimizes security structures for efficient bandwidth utilization, and reduces the overall traffic caused by security operations. 
Based on our evaluation, our proposed security support improves the GPU throughput by a geometric mean of 29.94% (up to 190.43%) over the conventional security model, and it reduces the security traffic in the memory subsystem to 47.79% on average (as low as 17.71% overhead).}, journal={2024 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA 2024}, author={Abdullah, Rahaf and Lee, Hyokeun and Zhou, Huiyang and Awad, Amro}, year={2024}, pages={233--248} } @article{shadab_zou_gandham_awad_lin_2024, title={A Secure Computing System With Hardware-Efficient Lazy Bonsai Merkle Tree for FPGA-Attached Embedded Memory}, volume={21}, ISSN={1941-0018}, DOI={10.1109/TDSC.2023.3324935}, abstractNote={With high-impact cyber-attacks on the rise, provisioning cybersecurity to the emerging Internet of Things (IoT) systems typically comprising of modern embedded computing platforms becomes significantly more challenging to achieve. Contemporary secure-memory computing stipulates both content encryption and integrity protection that can seriously impede the computing performance and consume excessive amount of hardware resources. In this paper, we focus on hardware-efficient verification of the memory integrity in the mission-critical computing tasks executing on an FPGA-based secure embedded system, effectively mitigating adversarial attacks such as memory buffer replay. We proposed an innovative partitioned parallel cache structure that leverages the unique reconfigurable capability of modern FPGA devices and successfully circumvents the hardware implementation challenges due to the recursiveness that inherently exists in Merkle tree updating schemes. We designed and implemented a new Bonsai Merkle tree (BMT) lazy update controller specifically designed for FPGA to efficiently exploit the parallelism offered by its reconfigurable fabric. 
Our experimental results for the new system show up to 95x and 149x latency overhead reduction respectively for write and read and up to 17% better throughput in standard benchmarks compared to software-based approach. Critical system performance is also improved with the lowering of average evictions by up to 8%.}, number={4}, journal={IEEE TRANSACTIONS ON DEPENDABLE AND SECURE COMPUTING}, author={Shadab, Rakin Muhammad and Zou, Yu and Gandham, Sanjay and Awad, Amro and Lin, Mingjie}, year={2024}, pages={3262--3279} } @article{alam_lee_bhattacharjee_awad_2023, title={CryptoMMU: Enabling Scalable and Secure Access Control of Third-Party Accelerators}, DOI={10.1145/3613424.3614311}, abstractNote={Due to increasing energy and performance gaps between general-purpose processors and hardware accelerators (e.g., FPGA or ASIC), clear trends for leveraging accelerators arise in various fields or workloads, such as edge devices, cloud systems, and data centers. Moreover, system integrators desire higher flexibility to deploy custom accelerators based on their performance, power, and cost constraints, where such integration can be as early as (1) at the design time when third-party intellectual properties (IPs) are used, (2) at integration/upgrade time when third-party discrete chip accelerators are used, or (3) during runtime as in reconfigurable logic.A malicious third-party accelerator can compromise the entire system by accessing other processes’ data, overwriting OS data structures, etc. To eliminate these security ramifications, a unit similar to a memory management unit (MMU), namely IOMMU, is typically used to scrutinize memory accesses from I/O devices, including accelerators. Still, IOMMU incurs significant performance overhead because it resides on the critical path of each I/O memory access. 
In this paper, we propose a novel scheme, CryptoMMU, to delegate the translation processes to accelerators, whereas the authentication of the targeted address is elegantly performed using a cryptography-based approach. As a result, CryptoMMU facilitates the private caching of translation in each accelerator, providing better scalability. Our evaluation results show that CryptoMMU improves system throughput by an average of 2.97× and 1.13× compared to the conventional IOMMU and the state-of-the-art solution, respectively. Importantly, CryptoMMU can be implemented without any software changes.CCS CONCEPTS• Security and privacy → Security in hardware; • Hardware → Very large scale integration design.}, journal={56TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE, MICRO 2023}, author={Alam, Faiz and Lee, Hyokeun and Bhattacharjee, Abhishek and Awad, Amro}, year={2023}, pages={32--48} } @article{chowdhuryy_jung_yao_awad_2023, title={D-Shield: Enabling Processor-side Encryption and Integrity Verification for Secure NVMe Drives}, ISSN={1530-0897}, DOI={10.1109/HPCA56546.2023.10070924}, abstractNote={Ensuring the confidentiality and integrity of data stored in storage disks is essential to protect users’ sensitive and private data. Recent developments of hardware-based attacks have motivated the need to secure storage data not only at rest but also in transit. Unfortunately, existing techniques such as software-based disk encryption and hardware-based self-encrypting disks fail to offer such comprehensive protection in today’s adversarial settings. With the advances of NVMe SSDs promising ultralow I/O latencies and high parallelism, architecting a storage subsystem that ensures the security of data storage in fast disks without adversely sacrificing their performance is critical.In this paper, we present D-Shield, a processor-side secure framework to holistically protect NVMe storage data confidentiality and integrity with low overheads. 
D-Shield integrates a novel DMA Interception Engine that allows the processor to perform security metadata maintenance and data protection without any modification to the NVMe protocol and NVMe disks. We further propose optimized D-Shield schemes that minimize decryption/re-encryption overheads for data transfer crossing security domains and utilize efficient in-memory caching of storage metadata to further boost system performance. We implement D-Shield prototypes and evaluate their efficacy using a set of synthetic and real-world benchmarks. Our results show that D-Shield can introduce up to 17× speedup for I/O intensive workloads compared to software-based protection schemes. For server-class database and graph applications, D-Shield achieves up to 96% higher throughput over software-based encryption and integrity checking mechanisms, while providing strong security guarantee against off-chip storage attacks. Meanwhile, D-Shield shows only 6% overhead on effective performance on real-world workloads and has modest in-storage metadata overhead and on-chip hardware cost.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Chowdhuryy, Md Hafizul Islam and Jung, Myoungsoo and Yao, Fan and Awad, Amro}, year={2023}, pages={908--921} } @article{nema_chunduru_kodigal_voskuilen_rodrigues_hemmert_feinberg_lee_awad_hughes_2023, title={ERAS: A Flexible and Scalable Framework for Seamless Integration of RTL Models with Structural Simulation Toolkit}, DOI={10.1109/IISWC59245.2023.00038}, abstractNote={The prevalence of custom Intellectual Properties (IPs) poses challenges for assessing their system-level performance and functional behavior. Register Transfer Level (RTL) simulation requires RTL-level integration with the rest of the system which is time- and resource-intensive. 
Similarly, developing functional and performance models of the IP requires considerable effort and expertise.This work proposes a framework, ERAS, that enables seamless integration of RTL IP models with high-level architectural simulators, such as Structural Simulation Toolkit (SST). The effectiveness of this framework is demonstrated through architectural exploration using a RISCV processor. Further, ERAS leverages SST’s multi-thread support to enhance simulation speed, effectively overcoming a key bottleneck of detailed RTL simulation. Evaluation with a dual-core RISCV-RTL configuration shows 1.83× simulation speed improvement compared to a serial simulation in gem5 as baseline.ERAS is now part of SST public repository: Link}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON WORKLOAD CHARACTERIZATION, IISWC}, author={Nema, Shubham and Chunduru, Shiva Kaushik and Kodigal, Charan and Voskuilen, Gwendolyn and Rodrigues, Arun F. and Hemmert, Scott and Feinberg, Ben and Lee, Hyokeun and Awad, Amro and Hughes, Clayton}, year={2023}, pages={196--200} } @article{shadab_zou_gandham_awad_lin_2023, title={HMT: A Hardware-centric Hybrid Bonsai Merkle Tree Algorithm for High-performance Authentication}, volume={22}, ISSN={1558-3465}, DOI={10.1145/3595179}, abstractNote={The Bonsai Merkle tree (BMT) is a widely used tree structure for authentication of metadata such as encryption counters in a secure computing system. Common BMT algorithms were designed for traditional Von Neumann architectures with a software-centric implementation in mind and as such, they are predominantly recursive and sequential in nature. However, the modern heterogeneous computing platforms employing Field-Programmable Gate Array (FPGA) devices require concurrency-focused algorithms to fully utilize the versatility and parallel nature of such systems. The recursive nature of traditional BMT algorithms makes them challenging to implement in such hardware-based setups. 
Our goal for this work is to introduce HMT, a hardware-friendly BMT algorithm that enables the verification and update processes to function independently and provides the benefits of relaxed update while being comparable to the eager update in terms of update complexity. The methodology of HMT contributes both novel algorithmic revisions and innovative hardware techniques to implementing BMT. We mathematically demonstrate the challenges of potentially unbounded recursions in relaxed BMT updates. To solve this problem, we use a partitioned BMT caching scheme that allocates a separate write-back cache for each BMT level—thus allowing for low and fixed upper bounds for dirty evictions compared to the traditional BMT caches. Then we introduce the aforementioned hybrid BMT algorithm that is hardware-targeted, parallel, and relaxes the update depending on BMT cache hit but makes the update conditions more flexible compared to lazy update to save additional write-backs. Deploying this new algorithm, we have designed a new BMT controller with a dataflow architecture including speculative buffers and parallel write-back engines to facilitate performance-enhancing mechanisms (like multiple concurrent authentication and independent updates) that were not possible with the conventional lazy algorithm. 
Our empirical performance measurements on a Xilinx U200 accelerator FPGA have demonstrated that HMT can achieve up to 7× improvement in bandwidth and 4.5× reduction in latency over lazy-update BMT baseline and up to 14% faster execution in standard benchmarks compared to a state-of-the-art, eager-update BMT solution.}, number={4}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Shadab, Rakin Muhammad and Zou, Yu and Gandham, Sanjay and Awad, Amro and Lin, Mingjie}, year={2023}, month={Jul} } @article{abdullah_zhou_awad_2023, title={Plutus: Bandwidth-Efficient Memory Security for GPUs}, ISSN={1530-0897}, DOI={10.1109/HPCA56546.2023.10071100}, abstractNote={Graphic-Processing Units (GPUs) are increasingly used in systems where security is a critical design requirement. Such systems include cloud computing, safety-critical systems, and edge devices, where sensitive data is processed or/and generated. Thus, the ability to reduce the attack surface while achieving high performance is of utmost importance. However, adding security features to GPUs comes at the expense of high-performance overheads due to the extra memory bandwidth required to handle security metadata. In particular, memory authentication metadata (e.g., authentication tags) along with encryption counters can lead to significant performance overheads due to the memory bandwidth used to fetch the metadata. Such metadata can lead to more than 200% extra bandwidth usage for irregular access patterns.In this work, we propose a novel design, Plutus, which enables low-overhead secure GPU memory. Plutus has three key ideas. The first is to leverage value locality to reduce authentication metadata. Our observation is that a large percentage of memory accesses could be verified without the need to bring the authentication tags. Specifically, through comparing decrypted blocks against known/verified values, we can with high confidence guarantee that no tampering occurred. 
Our analysis shows that the probability of the decryption of a tampered (and/or replayed) block leading to a known value is extremely low, in fact, lower than the collision probability in the most secure hash functions. Second, based on the observation that many GPU workloads have limited numbers of dirty block evictions, Plutus proposes a second layer of compact counters to reduce the memory traffic due to both the encryption counters and integrity tree. Third, by exploring the interesting tradeoff between the integrity tree organization vs. metadata fetch granularity, Plutus uses smaller block sizes for security metadata caches to optimize the number of security metadata memory requests. Based on our evaluation, Plutus can improve the GPU throughput by 16.86% (up to 58.38%) and reduce the memory bandwidth usage of secure memory by 48.14% (up to 80.30%).}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Abdullah, Rahaf and Zhou, Huiyang and Awad, Amro}, year={2023}, pages={543--555} } @article{abu_zubair_abdullah_mohaisen_awad_2024, title={RC-NVM: Recovery-Aware Reliability-Security Co-Design for Non-Volatile Memories}, volume={21}, ISSN={1941-0018}, DOI={10.1109/TDSC.2023.3279031}, abstractNote={Non-Volatile Memory (NVM) technologies are now available in the form of byte-addressable and fast main memory. Despite their benefits, such memories require secure and reliable memory management to prevent malicious and spontaneous data alteration. However, in NVM security, it is still a major challenge to maintain crash consistency and reliable system recovery. In particular, Message Authentication Codes (MAC) are rarely discussed in recent recovery-aware NVM studies since they are generally not cached. MACs have outstanding sensitivity to memory errors and hence they can be used for reliability enhancement alongside their mainstream use to detect malicious tampering. 
However, persisting MACs is challenging and requires 2x writes and reads in a conventional secure NVM system. It is possible to cache MACs in a MAC-assisted reliability scheme; however, this brings many challenges related to crash consistency and reliability. In this paper, we present the difficulties associated with MAC recovery if they are cached, and solutions to guarantee reliable system recovery. Finally, we propose a novel scheme, R ecoverable and C hipkill capable NVM , RC-NVM, which can effectively use a volatile write-back cache for MACs as well as recover them quickly after a system crash. Our scheme reduces 27% of the writes and allows 18.2% performance improvement compared to the state-of-the-art, while preserving the ability to recover from a system crash.}, number={4}, journal={IEEE TRANSACTIONS ON DEPENDABLE AND SECURE COMPUTING}, author={Abu Zubair, Kazi and Abdullah, Rahaf and Mohaisen, David and Awad, Amro}, year={2024}, pages={1817--1830} } @article{karabulut_awad_aysu_2023, title={SS-AXI: Secure and Safe Access Control Mechanism for Multi-Tenant Cloud FPGAs}, ISSN={0271-4302}, DOI={10.1109/ISCAS46773.2023.10181609}, abstractNote={FPGAs are newly added to the cloud to offer energy-efficient acceleration. Multi-tenancy is an emerging phenomenon in cloud FPGAs to enable resource efficiency. In a multi-tenant scenario, multiple users can share the same FPGA fabric either spatially (i.e., tenants share different resources at the same time) or temporally (tenants share the same resources in different time slots). Undesired access or manipulation of other tenant's data can cause security and safety issues. Although safety/security concepts in access control policies have been thoroughly studied in conventional cloud systems, they are relatively unknown for cloud FPGAs. Moreover, these concepts may not trivially extend to cloud FPGAs due to their different nature. This paper proposes an improved access control mechanism for multi-tenant cloud FPGAs. 
Compared to existing commercial tools, our solution allows dynamic configuration of access control privileges. Compared to earlier academic proposals with dynamic configuration, the results show that our proposal has three advantages: (i) enabling secure resource sharing of on-chip BRAMs to tenants, (ii) enabling safe sharing by resolving deadlocks and faulty access requests, and (iii) improvement in latency and throughput.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON CIRCUITS AND SYSTEMS, ISCAS}, author={Karabulut, Emre and Awad, Amro and Aysu, Aydin}, year={2023} } @article{han_tuck_awad_2023, title={Thoth: Bridging the Gap Between Persistently Secure Memories and Memory Interfaces of Emerging NVMs}, ISSN={1530-0897}, DOI={10.1109/HPCA56546.2023.10070991}, abstractNote={Emerging non-volatile memories (NVMs) are expected to be part of future computing systems, including cloud systems and edge devices. In addition to the high density (and hence large capacities) NVMs can provide, they feature ultra-low idle power which makes them very promising for edge computing and data centers. Additionally, NVMs’ ability to retain data upon system crash (e.g., power outage or software bug) makes them a great candidate for high-availability and persistent applications. However, NVMs’ data retention capability brings in security challenges and further complicates today’s secure memory implementations; to ensure correct and secure system recovery, the data and security metadata must be persisted atomically (i.e., up-to-date in memory upon a crash).Despite the many efforts for rethinking secure memory implementations to enable crash-consistency, we observe that the state-of-the-art solutions are based on a major assumption that may not be suitable for future memory interfaces. 
Specifically, the majority of today’s solutions assume that either the encryption counter and/or message-authentication code (MAC) can be co-located with data by directly or indirectly leveraging the otherwise Error-Correcting Codes (ECC) bits. However, we observe that emerging interfaces and standards delegate the ECC calculation and management to happen inside the memory module, which makes it possible to remove extra bits for ECC in memory interfaces. Thus, all today’s solutions may need to separately persist the encrypted data, its MAC, and its encryption counter upon each memory write. To mitigate this issue, we propose a novel solution, Thoth, which leverages a novel off-chip persistent partial updates combine buffer that can ensure crash consistency at the cost of a fraction of the write amplification by the state-of-the-art solutions when adapted to future interfaces. Based on our evaluation, Thoth improves the performance by an average of 1.22x (up to 1.44x) while reducing write traffic by an average of 32% (up to 40%) compared to the baseline Anubis when adapted to future interfaces.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Han, Xijing and Tuck, James and Awad, Amro}, year={2023}, pages={94--107} } @article{meteriz-yildiran_yildiran_awad_mohaisen_2022, title={A Keylogging Inference Attack on Air-Tapping Keyboards in Virtual Environments}, DOI={10.1109/VR51125.2022.00098}, abstractNote={Enabling users to push the physical world’s limits, augmented and virtual reality platforms opened a new chapter in perception. Novel immersive experiences resulted in the emergence of new interaction methods for virtual environments, which came with unprecedented security and privacy risks. This paper presents a keylogging inference attack to infer user inputs typed with in-air tapping keyboards. 
We observe that hands follow specific patterns when typing in the air and exploit this observation to carry out our attack. Starting with three plausible attack scenarios where the adversary obtains the hand trace patterns of the victim, we build a pipeline to reconstruct the user input. Our attack pipeline takes the hand traces of the victim as an input and outputs a set of input inferences ordered from the best to worst. Through various experiments, we showed that our inference attack achieves a pinpoint accuracy ranging from 40% to 87% within at most the top-500 candidate reconstructions. Finally, we discuss countermeasures, while the results presented provide a cautionary tale of the security and privacy risk of the immersive mobile technology.}, journal={2022 IEEE CONFERENCE ON VIRTUAL REALITY AND 3D USER INTERFACES (VR 2022)}, author={Meteriz-Yildiran, Ulku and Yildiran, Necip Fazil and Awad, Amro and Mohaisen, David}, year={2022}, pages={765--774} } @article{zou_abu_zubair_alwadi_shadab_gandham_awad_lin_2022, title={ARES: Persistently Secure Non-Volatile Memory with Processor-transparent and Hardware-friendly Integrity Verification and Metadata Recovery}, volume={21}, ISSN={1558-3465}, DOI={10.1145/3492735}, abstractNote={Emerging byte-addressable Non-Volatile Memory (NVM) technology, although promising superior memory density and ultra-low energy consumption, poses unique challenges to achieving persistent data privacy and computing security, both of which are critically important to the embedded and IoT applications. Specifically, to successfully restore NVMs to their working states after unexpected system crashes or power failure, maintaining and recovering all the necessary security-related metadata can severely increase memory traffic, degrade runtime performance, exacerbate write endurance problem, and demand costly hardware changes to off-the-shelf processors. 
In this article, we designed and implemented ARES, a new FPGA-assisted processor-transparent security mechanism that aims at efficiently and effectively achieving all three aspects of a security triad—confidentiality, integrity, and recoverability—in modern embedded computing. Given the growing prominence of CPU-FPGA heterogeneous computing architectures, ARES leverages FPGA’s hardware reconfigurability to offload performance-critical and security-related functions to the programmable hardware without microprocessors’ involvement. In particular, recognizing that the traditional Merkle tree caching scheme cannot fully exploit FPGA’s parallelism due to its sequential and recursive function calls, we (1) proposed a Merkle tree cache architecture that partitions a unified cache into multiple levels with parallel accesses and (2) further designed a novel Merkle tree scheme that flattened and reorganized the computation in the traditional Merkle tree verification and update processes to fully exploit the parallel cache ports and to fully pipeline time-consuming hashing operations. Beyond that, to accelerate the metadata recovery process, multiple parallel recovery units are instantiated to recover counter metadata and multiple Merkle sub-trees. Our hardware prototype of the ARES system on a Xilinx U200 platform shows that ARES achieved up to 1.4× lower latency and 2.6× higher throughput against the baseline implementation, while metadata recovery time was shortened by 1.8 times. When integrated with an embedded processor, neither hardware changes nor software changes are required. 
We also developed a theoretical framework to analytically model and explain experimental results.}, number={1}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Zou, Yu and Abu Zubair, Kazi and Alwadi, Mazen and Shadab, Rakin Muhammad and Gandham, Sanjay and Awad, Amro and Lin, Mingjie}, year={2022}, month={Feb} } @article{plagge_feinberg_mcfarland_rothganger_agarwal_awad_hughes_cardwell_2022, title={ATHENA: Enabling Codesign for Next-Generation AI/ML Architectures}, DOI={10.1109/ICRC57508.2022.00016}, abstractNote={There is a growing market for technologies ded-icated to accelerating Artificial Intelligence (AI) workloads. Many of these emerging architectures promise to provide savings in energy efficiency, area, and latency when compared to traditional CPUs for these types of applications. In particular, neuromorphic analog and digital technologies provide both low-power and configurable acceleration of challenging artificial intelligence (AI) algorithms. If designed into a heterogeneous system with other accelerators and conventional compute nodes, these technologies have the potential to augment the capabilities of traditional High Performance Computing (HPC) platforms. 
We present a codesign ecosystem that leverages an analytical tool, ATHENA, to accelerate design space exploration and evaluation of novel architectures.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON REBOOTING COMPUTING, ICRC}, author={Plagge, Mark and Feinberg, Ben and McFarland, John and Rothganger, Fred and Agarwal, Sapan and Awad, Amro and Hughes, Clayton and Cardwell, Suma G.}, year={2022}, pages={13--23} } @article{yuan_awad_yudha_solihin_zhou_2022, title={Adaptive Security Support for Heterogeneous Memory on GPUs}, ISSN={1530-0897}, DOI={10.1109/HPCA53966.2022.00024}, abstractNote={The wide use of accelerators such as GPUs necessities their security support Recent works [17], [33], [34] pointed out that directly adopting the CPU secure memory design to GPUs could incur significant performance overheads due to the memory bandwidth contention between regular data and security metadata. In this paper, we analyze the security guarantees that used to defend against physical attacks, and make the observation that heterogeneous GPU memory system may not always need all the security mechanisms to achieve the security guarantees. Based on the memory types as well as memory access patterns either explicitly specified in the GPU programming model or implicitly detected at run time, we propose adaptive security memory support for heterogeneous memory on GPUs. Specifically, we first identify the read-only data and propose to only use MAC (Message Authentication Code) to protect their integrity. By eliminating the freshness checks on read-only data, we can use an on-chip shared counter for such data regions and remove the corresponding parts in the Bonsai Merkel Tree (BMT), thereby reducing the traffic due to encryption counters and the BMT. Second, we detect the common streaming data access pattern and propose coarse- grain MACs for such stream data to reduce the MAC access bandwidth. 
With the hardware-based detection of memory type (read-only or not) and memory access patterns (streaming or not), our proposed approach adapts the security support to significantly reduce the performance overhead without sacrificing the security guarantees. Our evaluation shows that our scheme can achieve secure memory on GPUs with low overheads for memory-intensive workloads. Among the fifteen memory-intensive workloads in our evaluation, our design reduces the performance overheads of secure GPU memory from 53.9% to 8.09% on average. Compared to the state-of- the-art secure memory designs for GPU [17], [33], our scheme outperforms PSSM by up to 41.63% and 9.5% on average and outperforms Common counters by 84.04% on average for memory-intensive workloads. We further propose to use the L2 cache as a victim cache for security metadata when the L2 is either underutilized or suffers from very high miss rates, which further reduces the overheads by up to 4% and 0.65% on average.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Yuan, Shougang and Awad, Amro and Yudha, Ardhi Wiratama Baskara and Solihin, Yan and Zhou, Huiyang}, year={2022}, pages={213--228} } @article{zou_awad_lin_2022, title={DirectNVM: Hardware-accelerated NVMe SSDs for High-performance Embedded Computing}, volume={21}, ISSN={1558-3465}, DOI={10.1145/3463911}, abstractNote={With data-intensive artificial intelligence (AI) and machine learning (ML) applications rapidly surging, modern high-performance embedded systems, with heterogeneous computing resources, critically demand low-latency and high-bandwidth data communication. As such, the newly emerging NVMe (Non-Volatile Memory Express) protocol, with parallel queuing, access prioritization, and optimized I/O arbitration, starts to be widely adopted as a de facto fast I/O communication interface. 
However, effectively leveraging the potential of modern NVMe storage proves to be nontrivial and demands fine-grained control, high processing concurrency, and application-specific optimization. Fortunately, modern FPGA devices, capable of efficient parallel processing and application-specific programmability, readily meet the underlying physical layer requirements of the NVMe protocol, therefore providing unprecedented opportunities to implementing a rich-featured NVMe middleware to benefit modern high-performance embedded computing. In this article, we present how to rethink existing accessing mechanisms of NVMe storage and devise innovative hardware-assisted solutions to accelerating NVMe data access performance for the high-performance embedded computing system. Our key idea is to exploit the massively parallel I/O queuing capability, provided by the NVMe storage system, through leveraging FPGAs’ reconfigurability and native hardware computing power to operate transparently to the main processor. Specifically, our DirectNVM system aims at providing effective hardware constructs for facilitating high-performance and scalable userspace storage applications through (1) hardening all the essential NVMe driver functionalities, therefore avoiding expensive OS syscalls and enabling zero-copy data access from the application, (2) relying on hardware for the I/O communication control instead of relying on OS-level interrupts that can significantly reduce both total I/O latency and its variance, and (3) exposing cutting-edge and application-specific weighted-round-robin I/O traffic scheduling to the userspace. To validate our design methodology, we developed a complete DirectNVM system utilizing the Xilinx Zynq MPSoC architecture that incorporates a high-performance application processor (APU) equipped with DDR4 system memory and a hardened configurable PCIe Gen3 block in its programmable logic part. 
We then measured the storage bandwidth and I/O latency of both our DirectNVM system and a conventional OS-based system when executing the standard FIO benchmark suite [ 2 ]. Specifically, compared against the PetaLinux built-in kernel driver code running on a Zynq MPSoC, our DirectNVM has shown to achieve up to 18.4× higher throughput and up to 4.5× lower latency. To ensure the fairness of our performance comparison, we also measured our DirectNVM system against the Intel SPDK [ 26 ], a highly optimized userspace asynchronous NVMe I/O framework running on a X86 PC system. Our experiment results have shown that our DirectNVM, even running on a considerably less powerful embedded ARM processor than a full-scale AMD processor, achieved up to 2.2× higher throughput and 1.3× lower latency. Furthermore, by experimenting with a multi-threading test case, we have demonstrated that our DirectNVM’s weighted-round-robin scheduling can significantly optimize the bandwidth allocation between latency-constraint frontend applications and other backend applications in real-time systems. Finally, we have developed a theoretical framework of performance modeling with classic queuing theory that can quantitatively define the relationship between a system’s I/O performance and its I/O implementation. }, number={1}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Zou, Yu and Awad, Amro and Lin, Mingjie}, year={2022}, month={Feb} } @article{nema_kirschner_adak_agarwal_feinberg_rodrigues_marinella_awad_2022, title={Eris: Fault Injection and Tracking Framework for Reliability Analysis of Open-Source Hardware}, DOI={10.1109/ISPASS55109.2022.00027}, abstractNote={As transistors have been scaled over the past decade, modern systems have become increasingly susceptible to faults. Increased transistor densities and lower capacitances make a particle strike more likely to cause an upset. 
At the same time, complex computer systems are increasingly integrated into safety-critical systems such as autonomous vehicles. These two trends make the study of system reliability and fault tolerance essential for modern systems. To analyze and improve system reliability early in the design process, new tools are needed for RTL fault analysis. This paper proposes Eris, a novel framework to identify vulnerable components in hardware designs through fault-injection and fault propagation tracking. Eris builds on ESSENT—a fast C/C++ RTL simulation framework—to provide fault injection, fault tracking, and control-flow deviation detection capabilities for RTL designs. To demonstrate Eris’ capabilities, we analyze the reliability of the open source Rocket Chip SoC by randomly injecting faults during thousands of runs on four microbenchmarks. As part of this analysis we measure the sensitivity of different hardware structures to faults based on the likelihood of a random fault causing silent data corruption, unrecoverable data errors, program crashes, and program hangs. We detect control flow deviations and determine whether or not they are benign. Additionally, using Eris’ novel fault-tracking capabilities we are able to find 78% more vulnerable components in the same number of simulations compared to RTL-based fault injection techniques without these capabilities. We will release Eris as an open-source tool to aid future research into processor reliability and hardening.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON PERFORMANCE ANALYSIS OF SYSTEMS AND SOFTWARE (ISPASS 2022)}, author={Nema, Shubham and Kirschner, Justin and Adak, Debpratim and Agarwal, Sapan and Feinberg, Ben and Rodrigues, Arun F. and Marinella, Matthew J. and Awad, Amro}, year={2022}, pages={210–220} } @article{abu_zubair_mohaisen_awad_2022, title={Filesystem Encryption or Direct-Access for NVM Filesystems? 
Let's Have Both!}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA53966.2022.00043}, abstractNote={Emerging Non-Volatile Memories (NVMs) are promising candidates to build ultra-low idle power memory and storage devices in future computing systems. Unlike DRAM, NVMs do not require frequent refresh operations, and they can retain data after crashes and power loss. With such features, NVM memory modules can be used partly as a conventional memory to host memory pages and partly as file storage to host filesystems and persistent data. Most importantly, and unlike current storage technologies, NVMs can be directly attached to the memory bus and accessed through conventional load/store operations.As NVMs feature ultra-low access latency, it is necessary to minimize software overheads for accessing files to enable the full potential. In legacy storage devices, e.g., Flash and Harddisk drives, access latency dominates the software overheads. However, emerging NVMs’ performance can be burdened by the software overheads since memory access latency is minimal. Modern Operating Systems (OSes) allow direct-access (DAX) for NVM-hosted files through direct load/store operations by eliminating intermediate software layers. Unfortunately, we observe that such a direction ignores filesystem encryption and renders most of the current filesystem encryption implementations inapplicable to future NVM systems. In this paper, we propose a novel hardware/software co-design architecture that enables transparent filesystem encryption without sacrificing the direct-access feature of files in emerging NVMs with minimal change in OS and memory controller. 
Our proposed model incurs a negligible overall slowdown of 3.8% for workloads representative of real-world applications, while software-based encryption can incur as high as 5x slowdown for some applications.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Abu Zubair, Kazi and Mohaisen, David and Awad, Amro}, year={2022}, pages={490–502} } @article{dubey_karabulut_awad_aysu_2022, title={High-Fidelity Model Extraction Attacks via Remote Power Monitors}, DOI={10.1109/AICAS54282.2022.9869973}, abstractNote={This paper shows the first side-channel attack on neural network (NN) IPs through a remote power monitor. We demonstrate that a remote monitor implemented with time-to-digital converters can be exploited to steal the weights from a hardware implementation of NN inference. Such an attack alleviates the need to have physical access to the target device and thus expands the attack vector to multi-tenant cloud FPGA platforms. Our results quantify the effectiveness of the attack on an FPGA implementation of NN inference and compare it to an attack with physical access. We demonstrate that it is indeed possible to extract the weights using DPA with 25000 traces if the SNR is sufficient. 
The paper, therefore, motivates secure virtualization—to protect the confidentiality of high-valued NN model IPs in multi-tenant execution environments, platform developers need to employ strong countermeasures against physical side-channel attacks.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON ARTIFICIAL INTELLIGENCE CIRCUITS AND SYSTEMS (AICAS 2022): INTELLIGENT TECHNOLOGY IN THE POST-PANDEMIC ERA}, author={Dubey, Anuj and Karabulut, Emre and Awad, Amro and Aysu, Aydin}, year={2022}, pages={328–331} } @article{han_tuck_awad_2022, title={Horus: Persistent Security for Extended Persistence-Domain Memory Systems}, ISSN={["1072-4451"]}, DOI={10.1109/MICRO56248.2022.00087}, abstractNote={Persistent memory presents a great opportunity for crash-consistent computing in large-scale computing systems. The ability to recover data upon power outage or crash events can significantly improve the availability of large-scale systems, while improving the performance of persistent data applications (e.g., database applications). However, persistent memory suffers from high write latency and requires specific programming model (e.g., Intel’s PMDK) to guarantee crash consistency, which results in long latency to persist data. To mitigate these problems, recent standards advocate for sufficient back-up power that can flush the whole cache hierarchy to the persistent memory upon detection of an outage, i.e., extending the persistence domain to include the cache hierarchy. In the secure NVM with extended persistent domain (EPD), in addition to flushing the cache hierarchy, extra actions need to be taken to protect the flushed cache data. These extra actions of secure operation could cause significant burden on energy costs and battery size. We demonstrate that naive implementations could lead to significantly expanding the required power holdup budget (e.g., 10.3x more operations than EPD system without secure memory support). 
The significant overhead is caused by memory accesses of secure metadata. In this paper, we present Horus, a novel EPD-aware secure memory implementation. Horus reduces the overhead during draining period of EPD system by reducing memory accesses of secure metadata. Experiment result shows that Horus reduces the draining time by 5x, compared with the naive baseline design.}, journal={2022 55TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO)}, author={Han, Xijing and Tuck, James and Awad, Amro}, year={2022}, pages={1255–1269} } @article{alwadi_wang_mohaisen_hughes_hammond_awad_2022, title={Minerva: Rethinking Secure Architectures for the Era of Fabric-Attached Memory Architectures}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS53621.2022.00033}, abstractNote={Fabric-attached memory (FAM) is proposed to enable the seamless integration of directly accessible memory modules attached to the shared system fabric, which will provide future systems with flexible memory integration options, mitigate underutilization, and facilitate data sharing. Recently proposed interconnects, such as Gen-Z and Compute Express Link (CXL), define security, correctness, and performance requirements of fabric-attached devices, including memory. These initiatives are supported by most major system and processor vendors, bringing widespread adoption of FAM-enabled systems one step closer to reality and security concerns to the forefront. This paper discusses the challenges for adapting secure memory implementations to FAM-enabled systems for the first time in literature. Specifically, we observe that handling the security metadata used to protect fabric-attached memories needs to be done deliberately to eliminate unintentional integrity check failures and/or security vulnerabilities, caused by an inconsistent view of the shared security metadata across nodes. 
Our scheme, Minerva, elegantly adapts secure memory implementations to support FAM-enabled systems with negligible performance overheads (3.8% of an ideal scheme), compared to the performance overhead (99.5% of an ideal scheme) for a scheme that uses conventional invalidation-based cache coherence to ensure the consistency of security metadata across nodes.}, journal={2022 IEEE 36TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2022)}, author={Alwadi, Mazen and Wang, Rujia and Mohaisen, David and Hughes, Clayton and Hammond, Simon David and Awad, Amro}, year={2022}, pages={258–268} } @article{rashed_awad_jha_ewetz_2022, title={Towards Resilient Analog In-Memory Deep Learning via Data Layout Re-Organization}, DOI={10.1145/3489517.3530532}, abstractNote={Processing in-memory paves the way for neural network inference engines. An arising challenge is to develop the software/hardware interface to automatically compile deep learning models onto in-memory computing platforms. In this paper, we observe that the data layout organization of a deep neural network (DNN) model directly impacts the model's classification accuracy. This stems from that the resistive parasitics within a crossbar introduces a dependency between the matrix data and the precision of the analog computation. To minimize the impact of the parasitics, we first perform a case study to understand the underlying matrix properties that result in computation with low and high precision, respectively. Next, we propose the XORG framework that performs data layout organization for DNNs deployed on in-memory computing platforms. The data layout organization improves precision by optimizing the weight matrix to crossbar assignments at compile time. The experimental results show that the XORG framework improves precision with up to 3.2X and 31% on the average. 
When accelerating DNNs using XORG, the write bit-accuracy requirements are relaxed with 1-bit and the robustness to random telegraph noise (RTN) is improved.}, journal={PROCEEDINGS OF THE 59TH ACM/IEEE DESIGN AUTOMATION CONFERENCE, DAC 2022}, author={Rashed, Muhammad Rashedul Haq and Awad, Amro and Jha, Sumit Kumar and Ewetz, Rickard}, year={2022}, pages={859–864} } @article{mcfarland_awad_2022, title={Transpose-Xen: Virtualized Mixed-Criticality through Dynamic Allocation}, DOI={10.1145/3477314.3506979}, abstractNote={Cloud systems continue to rise in popularity due to their ability to provide access to flexible, scalable systems to be shared among all their users. Several tasks can be executed simultaneously within a server, but have varying requirements for completion. While some jobs may have latency-critical quality-of-service (QoS) requirements, others may have stricter real-time constraints for maximum deadline misses. The introduction of hard real-time tasks, where zero deadline misses are acceptable, results in scheduling concurrent jobs becoming increasingly difficult. In this paper we propose Transpose-Xen, an adaptive hyper-visor scheduler capable of managing the scheduling of tasks of varying levels of criticality, including tasks with hard real-time constraints. Transpose-Xen is able to execute multiple jobs of varying criticality by finding similarities between the resource needs of each task despite potentially executing from separate VMs. Once grouped into these resource sub-pools, or ponds, our scheduler allocates the needed resources to ensure that each job is schedulable if possible. Transpose-Xen also leverages the use of virtual-deadlines, a scheduling algorithm that we use to prioritize higher-criticality tasks without completely starving lower-criticality tasks of resources. 
By profiling the impact of resource allocation on real-time tasks, different jobs of varying criticality levels can be scheduled concurrently - capable of satisfying both hard real-time constraints and satisfying up to 99% of all soft real-time deadlines.}, journal={37TH ANNUAL ACM SYMPOSIUM ON APPLIED COMPUTING}, author={McFarland, John and Awad, Amro}, year={2022}, pages={3–12} } @article{choi_anwar_alabduljabbar_alasmary_spaulding_wang_chen_nyang_awad_mohaisen_2022, title={Understanding Internet of Things malware by analyzing endpoints in their static artifacts}, volume={206}, ISSN={["1872-7069"]}, DOI={10.1016/j.comnet.2022.108768}, abstractNote={The lack of security measures among the Internet of Things (IoT) devices and their persistent online connection gives adversaries a prime opportunity to target them or even abuse them as intermediary targets in larger attacks such as distributed denial-of-service (DDoS) campaigns. In this paper, we analyze IoT malware and focus on the endpoints reachable on the public Internet, that play an essential part in the IoT malware ecosystem. Namely, we analyze endpoints acting as dropzones and their targets to gain insights into the underlying dynamics in this ecosystem, such as the affinity between the dropzones and their target IP addresses, and the different patterns among endpoints. Towards this goal, we reverse-engineer 2423 IoT malware samples and extract strings from them to obtain IP addresses. We further gather information about these endpoints from public Internet-wide scanners, such as Shodan and Censys. Our results, through analysis and visualization expose clear patterns of affinity between sources and targets of attacks, attack exposure by Internet infrastructure, and clear depiction of the ecosystem of IoT malware as a whole, only utilizing static artifacts. 
Our investigation from four different perspectives provides profound insights into the role of endpoints in IoT malware attacks, which deepens our understanding of IoT malware ecosystems and can assist future defenses.}, journal={COMPUTER NETWORKS}, author={Choi, Jinchun and Anwar, Afsah and Alabduljabbar, Abdulrahman and Alasmary, Hisham and Spaulding, Jeffrey and Wang, An and Chen, Songqing and Nyang, DaeHun and Awad, Amro and Mohaisen, David}, year={2022}, month={Apr} } @article{kommareddy_hughes_hammond_awad_2021, title={DeACT: Architecture-Aware Virtual Memory Support for Fabric Attached Memory Systems}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA51647.2021.00046}, abstractNote={1 The exponential growth of data has driven technology providers to develop new protocols, such as cache coherent interconnects and memory semantic fabrics, to help users and facilities leverage advances in memory technologies to satisfy these growing memory and storage demands. Using these new protocols, fabric-attached memories (FAM) can be directly attached to a system interconnect and be easily integrated with a variety of processing elements (PEs). Moreover, systems that support FAM can be smoothly upgraded and allow multiple PEs to share the FAM memory pools using well-defined protocols. The sharing of FAM between PEs allows efficient data sharing, improves memory utilization, reduces cost by allowing flexible integration of different PEs and memory modules from several vendors, and makes it easier to upgrade the system. One promising use-case for FAMs is in High-Performance Compute (HPC) systems, where the underutilization of memory is a major challenge. However, adopting FAMs in HPC systems brings new challenges. In addition to cost, flexibility, and efficiency, one particular problem that requires rethinking is virtual memory support for security and performance. 
To address these challenges, this paper presents decoupled access control and address translation (DeACT), a novel virtual memory implementation that supports HPC systems equipped with FAM. Compared to the state-of-the-art two-level translation approach, DeACT achieves speedup of up to 4.59x (1.8x on average) without compromising security. Part of this work was done when Vamsee was working under the supervision of Amro Awad at UCF. Amro Awad is now with the ECE Department at NC State.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Kommareddy, Vamsee Reddy and Hughes, Clayton and Hammond, Simon David and Awad, Amro}, year={2021}, pages={453–466} } @article{zou_awad_lin_2021, title={HERMES: Hardware-Efficient Speculative Dataflow Architecture for Bonsai Merkle Tree-Based Memory Authentication}, DOI={10.1109/HOST49136.2021.9702283}, abstractNote={Emerging byte-addressable Non-Volatile Memory (NVM) technology, although promising superior memory density and ultra-low energy consumption, poses unique challenges to guaranteeing memory confidentiality, integrity, and crash-consistency. As such, extensive research has been conducted to transparently protect memory security through an FPGA-implemented middleware that effectively deploys encryption, authentication/integrity verification, and replay attack protection. Bonsai Merkle tree (BMT) has been proven to be highly effective in guaranteeing memory integrity and protecting against replay attack. However, when used in a strictly persistent trusted execution environment (TEE), BMT-based memory integrity protection severely bottlenecks memory performance because properly maintaining a BMT results in a deep traversal over the hash tree for every counter update. 
In this paper, we propose HERMES, a hardware-efficient memory integrity engine specifically designed to deliver a crash-consistent BMT for NVM, capable of processing multiple outstanding counter requests in flight, which significantly improves both latency and throughput of all BMT operations through leveraging an asynchronous dataflow architecture and speculative execution. HERMES incorporates three architectural innovations: (1) a speculative control logic and a speculative temporary buffer dedicatedly designed and deployed at each level; (2) an optimized hardware component verifying all BMT levels in parallel and an adaptive algorithm adapting to caching status of BMT levels; (3) a formalized message format transferred between BMT levels to accommodate both counter operations within a unified architecture where each level is able to adaptively behave. Experimented with Shuhai memory bandwidth benchmark, HERMES achieved up to 7.9x higher throughput and up to 3.5x shorter latency over the state-of-the-art ARES while consuming 2x resource utilization as a tradeoff.}, journal={2021 IEEE INTERNATIONAL SYMPOSIUM ON HARDWARE ORIENTED SECURITY AND TRUST (HOST)}, author={Zou, Yu and Awad, Amro and Lin, Mingjie}, year={2021}, pages={203–213} } @article{wang_zhao_hou_awad_tian_meng_2021, title={NASGuard: A Novel Accelerator Architecture for Robust Neural Architecture Search (NAS) Networks}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA52012.2021.00066}, abstractNote={Due to the wide deployment of deep learning applications in safety-critical systems, robust and secure execution of deep learning workloads is imperative. Adversarial examples, where the inputs are carefully designed to mislead the machine learning model is among the most challenging attacks to detect and defeat. The most dominant approach for defending against adversarial examples is to systematically create a network architecture that is sufficiently robust. 
Neural Architecture Search (NAS) has been heavily used as the de facto approach to design robust neural network models, by using the accuracy of detecting adversarial examples as a key metric of the neural network’s robustness. While NAS has been proven effective in improving the robustness (and accuracy in general), the NAS-generated network models run noticeably slower on typical DNN accelerators than the hand-crafted networks, mainly because DNN accelerators are not optimized for robust NAS-generated models. In particular, the inherent multi-branch nature of NAS-generated networks causes unacceptable performance and energy overheads. To bridge the gap between the robustness and performance efficiency of deep learning applications, we need to rethink the design of AI accelerators to enable efficient execution of robust (auto-generated) neural networks. In this paper, we propose a novel hardware architecture, NASGuard, which enables efficient inference of robust NAS networks. NASGuard leverages a heuristic multi-branch mapping model to improve the efficiency of the underlying computing resources. Moreover, NASGuard addresses the load imbalance problem between the computation and memory-access tasks from multi-branch parallel computing. Finally, we propose a topology-aware performance prediction model for data prefetching, to fully exploit the temporal and spatial localities of robust NAS-generated architectures. We have implemented NASGuard with Verilog RTL. 
The evaluation results show that NASGuard achieves an average speedup of 1.74× over the baseline DNN accelerator.}, journal={2021 ACM/IEEE 48TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2021)}, author={Wang, Xingbin and Zhao, Boyan and Hou, Rui and Awad, Amro and Tian, Zhihong and Meng, Dan}, year={2021}, pages={776–789} } @article{yuan_wang_hou_li_li_zhao_ying_awad_meng_2021, title={PiPoMonitor: Mitigating Cross-core Cache Attacks Using the Auto-Cuckoo Filter}, DOI={10.23919/DATE51398.2021.9473988}, abstractNote={Cache side channel attacks obtain victim cache line access footprint to infer security-critical information. Among them, cross-core attacks exploiting the shared last level cache are more threatening as their simplicity to set up and high capacity. Stateful approaches of detection-based mitigation observe precise cache behaviors and protect specific cache lines that are suspected of being attacked. However, their recording structures incur large storage overhead and are vulnerable to reverse engineering attacks. Exploring the intrinsic non-determinate layout of a traditional Cuckoo filter, this paper proposes a space efficient Auto-Cuckoo filter to record access footprints, which succeed to decrease storage overhead and resist reverse engineering attacks at the same time. With Auto-Cuckoo filter, we propose PiPoMonitor to detect Ping-Pong patterns and prefetch specific cache line to interfere with adversaries' cache probes. Security analysis shows the PiPoMonitor can effectively mitigate cross-core attacks and the Auto-Cuckoo filter is immune to reverse engineering attacks. 
Evaluation results indicate PiPoMonitor has negligible impact on performance and the storage overhead is only 0.37%, an order of magnitude lower than previous stateful approaches.}, journal={PROCEEDINGS OF THE 2021 DESIGN, AUTOMATION & TEST IN EUROPE CONFERENCE & EXHIBITION (DATE 2021)}, author={Yuan, Fengkai and Wang, Kai and Hou, Rui and Li, Xiaoxin and Li, Peinan and Zhao, Lutan and Ying, Jiameng and Awad, Amro and Meng, Dan}, year={2021}, pages={1697–1702} } @article{alasad_lin_yuan_fan_awad_2021, title={Resilient and Secure Hardware Devices Using ASL}, volume={17}, ISSN={["1550-4840"]}, DOI={10.1145/3429982}, abstractNote={Due to the globalization of Integrated Circuit (IC) design in the semiconductor industry and the outsourcing of chip manufacturing, Third-Party Intellectual Properties (3PIPs) become vulnerable to IP piracy, reverse engineering, counterfeit IC, and hardware trojans. To thwart such attacks, ICs can be protected using logic encryption techniques. However, strong resilient techniques incur significant overheads. Side-channel attacks (SCAs) further complicate matters by introducing potential attacks post fabrication. One of the most severe SCAs is power analysis (PA) attacks, in which an attacker can observe the power variations of the device and analyze them to extract the secret key. PA attacks can be mitigated via adding large extra hardware; however, the overheads of such solutions can render them impractical, especially when there are power and area constraints. All Spin Logic Device (ASLD) is one of the most promising spintronic devices due to its unique properties: small area, no spin-charge signal conversion, zero leakage current, non-volatile memory, high density, low operating voltage, and its compatibility with conventional CMOS technology. 
In this article, we extend the work in Reference [1] on the usage of ASLD to produce secure and resilient circuits that withstand IC attacks (during the fabrication) and PA attacks (after fabrication), including reverse engineering attacks. First, we show that ASLD has another unique feature: identical power dissipation through the switching operations, where such properties can be effectively used to prevent PA and IC attacks. We then evaluate the proposed ASLD-based on performance overheads and security guarantees.}, number={2}, journal={ACM JOURNAL ON EMERGING TECHNOLOGIES IN COMPUTING SYSTEMS}, author={Alasad, Qutaiba and Lin, Jie and Yuan, Jiann-Shuin and Fan, Deliang and Awad, Amro}, year={2021}, month={Feb} } @article{alasmary_anwar_abusnaina_alabduljabbar_abuhamad_wang_nyang_awad_mohaisen_2022, title={SHELLCORE: Automating Malicious IoT Software Detection Using Shell Commands Representation}, volume={9}, ISSN={["2327-4662"]}, DOI={10.1109/JIOT.2021.3086398}, abstractNote={The Linux shell is a command-line interpreter that provides users with a command interface to the operating system, allowing them to perform various functions. Although very useful in building capabilities at the edge, the Linux shell can be exploited, giving adversaries a prime opportunity to use them for malicious activities. With access to Internet of Things (IoT) devices, malware authors can abuse the Linux shell of those devices to propagate infections and launch large-scale attacks, e.g., Distributed Denial of Service. In this work, we provide a first look at the tasks managed by shell commands in Linux-based IoT malware toward detection. We analyze malicious shell commands found in IoT malware and build a neural network-based model, ShellCore, to detect malicious shell commands. 
Namely, we collected a large data set of shell commands, including malicious commands extracted from 2891 IoT malware samples and benign commands collected from real-world network traffic analysis and volunteered data from Linux users. Using conventional machine and deep learning-based approaches trained with a term- and character-level features, ShellCore is shown to achieve an accuracy of more than 99% in detecting malicious shell commands and files (i.e., binaries).}, number={4}, journal={IEEE INTERNET OF THINGS JOURNAL}, author={Alasmary, Hisham and Anwar, Afsah and Abusnaina, Ahmed and Alabduljabbar, Abdulrahman and Abuhamad, Mohammed and Wang, An and Nyang, Daehun and Awad, Amro and Mohaisen, David}, year={2022}, month={Feb}, pages={2485–2496} } @article{chowdhuryy_ewetz_awad_yao_2021, title={Seeds of SEED: R-SAW: New Side Channels Exploiting Read Asymmetry in MLC Phase Change Memories}, DOI={10.1109/SEED51797.2021.00013}, abstractNote={Phase Change Memory (PCM) is a promising contender for future main memory solutions. While many architecture-level performance optimizations have been studied for PCM, the security implications of these designs are not well understood. This work demonstrates the first investigation of information leakage threats in PCM-based main memories. Notably, we find state-of-the-art read techniques leveraging access latency asymmetry in Multi-level Cell (MLC) PCM introduce new timing variations. To understand the severity of the vulnerability, we present R-SAW, a novel side channel attack that aims to exfiltrate secrets from a victim process via passively observing execution timings that are correlated with secret-dependent PCM accesses. We demonstrate the attack on a real-world cryptographic algorithm–AES encryption in OpenSSL. Our evaluation shows that R-SAW is able to completely recover the encryption keys. 
Furthermore, our experiments reveal that R-SAW exhibits superior noise resilience compared to the widely-studied cache-based side channels. Our work highlights the importance of understanding security in systems integrated with emerging memory technologies and motivates the need to architect secure-by-design PCM main memories in the future.}, journal={2021 INTERNATIONAL SYMPOSIUM ON SECURE AND PRIVATE EXECUTION ENVIRONMENT DESIGN (SEED 2021)}, author={Chowdhuryy, Md Hafizul Islam and Ewetz, Rickard and Awad, Amro and Yao, Fan}, year={2021}, pages={22–28} } @article{alwadi_kommareddy_hughes_hammond_awad_2021, title={Stealth-Persist: Architectural Support for Persistent Applications in Hybrid Memory Systems}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA51647.2021.00022}, abstractNote={Non-volatile memories (NVMs) have the characteristics of both traditional storage systems (persistent) and traditional memory systems (byte-addressable). However, they suffer from high write latency and have a limited write endurance. Researchers have proposed hybrid memory systems that combine DRAM and NVM, utilizing the lower latency of the DRAM to hide some of the shortcomings of the NVM – improving system’s performance by caching resident NVM data in the DRAM. However, this can nullify the persistency of the cached pages, leading to a question of trade-offs in terms of performance and reliability. In this paper, we propose Stealth-Persist, a novel architecture support feature that allows applications that need persistence to run in the DRAM while maintaining the persistency features provided by the NVM. Stealth-Persist creates the illusion of a persistent memory for the application to use, while utilizing the DRAM for performance optimizations. 
Our experimental results show that Stealth-Persist improves the performance by 42.02% for persistent applications.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Alwadi, Mazen and Kommareddy, Vamsee Reddy and Hughes, Clayton and Hammond, Simon David and Awad, Amro}, year={2021}, pages={139–152} } @misc{arafat_guo_awad_2021, title={VR-Spy: A Side-Channel Attack on Virtual Key-Logging in VR Headsets}, url={http://dx.doi.org/10.1109/vr50410.2021.00081}, DOI={10.1109/VR50410.2021.00081}, abstractNote={In Virtual Reality (VR), users typically interact with the virtual world using virtual keyboard to insert keywords, surfing the webpages, or typing passwords to access online accounts. Hence, it becomes imperative to understand the security of virtual keystrokes. In this paper, we present VR-Spy, a virtual keystrokes recognition method using channel state information (CSI) of WiFi signals. To the best of our knowledge, this is the first work that uses WiFi signals to recognize virtual keystrokes in VR headsets. The key idea behind VR -Spy is that the side-channel information of fine-granular hand movements associated with each virtual keystroke has a unique gesture pattern in the CSI waveforms. Our novel pattern extraction algorithm leverages signal processing techniques to extract the patterns from the variations of CSI. We implement VR-Spy using two Commercially Off-The-Shelf (COTS) devices, a transmitter (WAVLINK router), and a receiver (Intel NUC with an IWL 5300 NIC). 
Finally, VR-Spy achieves a virtual keystrokes recognition accuracy of 69.75% in comparison to techniques that assume very advanced adversary models with vision and motion sensors near the victim.}, journal={2021 IEEE Virtual Reality and 3D User Interfaces (VR)}, publisher={IEEE}, author={Arafat, Abdullah Al and Guo, Zhishan and Awad, Amro}, year={2021}, month=mar, pages={564--572} } @article{che_yang_awad_wang_2020, title={A Lightweight Memory Access Pattern Obfuscation Framework for NVM}, volume={19}, ISSN={1556-6064}, DOI={10.1109/LCA.2020.3041484}, abstractNote={Emerging Non-Volatile Memories (NVMs) are entering the mainstream market. With attractive performance, high density, and near-zero idle power, emerging NVMs are promising contenders to build future memory systems. On the other hand, their limited write endurance ($10^6$ to $10^8$ write cycles) and enabling data remanence attacks remain as main challenges that could hinder the wide adoption of NVMs in many sectors. With the limited write-endurance of NVMs, implementing major security primitives, such as Oblivious RAM (ORAM) for memory access pattern obfuscation, become more challenging due to a massive extra-write operation demand, which will exacerbate the write endurance problem. Wear-leveling techniques aim at mitigating the endurance problem of NVM by shuffling locations and continuously changing the mappings of memory addresses to the actual memory cells. However, such wear-leveling techniques do not provide the access pattern obfuscation for security. In this letter, we propose a lightweight memory access pattern obfuscation framework for NVM, called O-NVM, a novel design that achieves both wear-leveling and obfuscation with an appealing level of performance and security (obfuscation) guarantees. 
Our experimental results show that the NVM's lifetime can be improved by $4-125\times$, and the randomness of address access patterns can achieve the passing rate of statistics tests over 99 percent.}, number={2}, journal={IEEE COMPUTER ARCHITECTURE LETTERS}, author={Che, Yuezhi and Yang, Yuanzhou and Awad, Amro and Wang, Rujia}, year={2020}, pages={163--166} } @article{awad_wang_2020, title={Guest Editors' Introduction to the Special Issue on Hardware Security}, volume={69}, ISSN={1557-9956}, DOI={10.1109/TC.2020.3021223}, abstractNote={The twelve papers in this special section focus on hardware security. This topic is becoming a significant challenge in modern computing systems. Recently discovered hardware vulnerabilities, such as Spectre and Meltdown, are striking evidence that today’s computing systems are untenable without deliberate consideration of the security aspects at the design time. The papers address various topics related to hardware security: secure-by-design architectures, secure speculative execution, secure system integration of untrusted chiplets, malware detection, program analysis using power side channels, architecture support for forensics, and efficient implementations of security modules.}, number={11}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Awad, Amro and Wang, Rujia}, year={2020}, month=nov, pages={1556--1557} } @article{alwadi_abu_zubair_mohaisen_awad_2022, title={Phoenix: Towards Ultra-Low Overhead, Recoverable, and Persistently Secure NVM}, volume={19}, ISSN={1941-0018}, DOI={10.1109/TDSC.2020.3020085}, abstractNote={Emerging Non-Volatile Memories (NVMs) bring a unique challenge to the security community, namely persistent security. As NVM-based memories are expected to restore their data after recovery, the security metadata must be recovered as well. However, persisting all affected security metadata on each memory write would significantly degrade performance and exacerbate the write endurance problem. 
On the other hand, relying on an encryption counters recovery scheme would take hours to rebuild the integrity tree, and will not be sufficient to rebuild the Tree-of-Counters (ToC). Due to intermediate nodes dependencies it is not possible to recover this type of trees using the encryption counters. To ensure recoverability, all updates to the security metadata must be persisted, which can be tens of additional writes on each write. In this article, we propose Phoenix, a practical novel scheme which relies on elegantly reproducing the cache content before a crash, however with minimal overheads. Our evaluation results show that Phoenix reduces persisting security metadata overhead writes to 3.8 percent less than a write-back encrypted system without recovery, thus improving the NVM lifetime by 8x. Overall Phoenix performance is better than the baseline.}, number={2}, journal={IEEE TRANSACTIONS ON DEPENDABLE AND SECURE COMPUTING}, author={Alwadi, Mazen and Abu Zubair, Kazi and Mohaisen, David and Awad, Amro}, year={2022}, pages={1049–1063} } @article{awad_manadhata_haber_solihin_horne_2016, title={Silent Shredder: Zero-Cost Shredding for Secure Non-Volatile Main Memory Controllers}, volume={51}, ISSN={["1558-1160"]}, DOI={10.1145/2954679.2872377}, abstractNote={As non-volatile memory (NVM) technologies are expected to replace DRAM in the near future, new challenges have emerged. For example, NVMs have slow and power-consuming writes, and limited write endurance. In addition, NVMs have a data remanence vulnerability, i.e., they retain data for a long time after being powered off. NVM encryption alleviates the vulnerability, but exacerbates the limited endurance by increasing the number of writes to memory. We observe that, in current systems, a large percentage of main memory writes result from data shredding in operating systems, a process of zeroing out physical pages before mapping them to new processes, in order to protect previous processes' data. 
In this paper, we propose Silent Shredder, which repurposes initialization vectors used in standard counter mode encryption to completely eliminate the data shredding writes. Silent Shredder also speeds up reading shredded cache lines, and hence reduces power consumption and improves overall performance. To evaluate our design, we run three PowerGraph applications and 26 multi-programmed workloads from the SPEC 2006 suite, on a gem5-based full system simulator. Silent Shredder eliminates an average of 48.6% of the writes in the initialization and graph construction phases. It speeds up main memory reads by 3.3 times, and improves the number of instructions per cycle (IPC) by 6.4% on average. Finally, we discuss several use cases, including virtual machines' data isolation and user-level large data initialization, where Silent Shredder can be used effectively at no extra cost.}, number={4}, journal={ACM SIGPLAN NOTICES}, author={Awad, Amro and Manadhata, Pratyusa and Haber, Stuart and Solihin, Yan and Horne, William}, year={2016}, month=apr, pages={263--276} } @inproceedings{awad_manadhata_haber_solihin_horne_2016a, title={Silent shredder: Zero-cost shredding for secure non-volatile main memory controllers}, volume={50}, DOI={10.1145/2954680.2872377}, abstractNote={As non-volatile memory (NVM) technologies are expected to replace DRAM in the near future, new challenges have emerged. For example, NVMs have slow and power-consuming writes, and limited write endurance. In addition, NVMs have a data remanence vulnerability, i.e., they retain data for a long time after being powered off. NVM encryption alleviates the vulnerability, but exacerbates the limited endurance by increasing the number of writes to memory. We observe that, in current systems, a large percentage of main memory writes result from data shredding in operating systems, a process of zeroing out physical pages before mapping them to new processes, in order to protect previous processes' data. 
In this paper, we propose Silent Shredder, which repurposes initialization vectors used in standard counter mode encryption to completely eliminate the data shredding writes. Silent Shredder also speeds up reading shredded cache lines, and hence reduces power consumption and improves overall performance. To evaluate our design, we run three PowerGraph applications and 26 multi-programmed workloads from the SPEC 2006 suite, on a gem5-based full system simulator. Silent Shredder eliminates an average of 48.6% of the writes in the initialization and graph construction phases. It speeds up main memory reads by 3.3 times, and improves the number of instructions per cycle (IPC) by 6.4% on average. Finally, we discuss several use cases, including virtual machines' data isolation and user-level large data initialization, where Silent Shredder can be used effectively at no extra cost.}, number={2}, booktitle={Operating Systems Review}, author={Awad, A. and Manadhata, P. and Haber, S. and Solihin, Y. and Horne, W.}, year={2016}, pages={263–276} } @inproceedings{awad_kettering_solihin_2015, title={Non-volatile memory host controller interface performance analysis in high-performance I/O systems}, DOI={10.1109/ispass.2015.7095793}, abstractNote={Emerging non-volatile memories (NVMs), such as Phase-Change Memory (PCM), Spin-Transfer Torque RAM (STT-RAM) and Memristor, are very promising candidates for replacing NAND-Flash Solid-State Drives (SSDs) and Hard Disk Drives (HDDs) for many reasons. First, their read/write latencies are orders of magnitude faster. Second, some emerging NVMs, such as memristors, are expected to have very high densities, which allow deploying a much higher capacity without requiring increased physical space. While the percentage of the time taken for data movement over low-speed buses, such as Peripheral Component Interconnect (PCI), is negligible for the overall read/write latency in HDDs, it could be dominant for emerging fast NVMs. 
Therefore, the trend has moved toward using very fast interconnect technologies, such as PCI Express (PCIe) which is hundreds of times faster than the traditional PCI. Accordingly, new host controller interfaces are used to communicate with I/O devices to exploit the parallelism and low-latency features of emerging NVMs through high-speed interconnects. In this paper, we investigate the system performance bottlenecks and overhead of using the standard state-of-the-art Non-Volatile Memory Express (NVMe), or Non-Volatile Memory Host Controller Interface (NVMHCI) Specification [1] as representative for NVM host controller interfaces.}, booktitle={IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)}, author={Awad, A. and Kettering, B. and Solihin, Y.}, year={2015}, pages={145--154} } @inproceedings{awad_solihin_2014, title={{STM}: Cloning the spatial and temporal memory access behavior}, DOI={10.1109/hpca.2014.6835935}, abstractNote={Computer architects need a deep understanding of clients' workload in order to design and tune the architecture. Unfortunately, many important clients will not share their software to computer architects due to the proprietary or confidential nature of their software. One technique to mitigate this problem is producing synthetic traces (clone) that replicate the behavior of the original workloads. Unfortunately, today there is no universal cloning technique that can capture arbitrary memory access behavior of applications. Existing technique captures only temporal, but not spatial, locality. In order to study memory hierarchy organization beyond caches, such as including prefetchers and translation lookaside buffer (TLB), capturing only temporal locality is insufficient. In this paper, we propose a new memory access behavior cloning technique that captures both temporal and spatial locality. We abbreviate our scheme as Spatio-Temporal Memory (STM) cloning. 
We propose a new profiling method and statistics that capture stride patterns and transition probabilities. We show how the new statistics enable accurate clone generation that allow clones to be used in place of the original benchmarks for studying the L1/L2/TLB miss rates as we vary the L1 cache, L1 prefetcher, L2 cache, TLB, and page size configurations.}, booktitle={International Symposium on High-Performance Computer Architecture (HPCA)}, author={Awad, A. and Solihin, Y.}, year={2014}, pages={237--247} }