% Conventions used for the entries below: conference papers are typed as
% inproceedings with the venue in booktitle, journal papers stay as article,
% page ranges use "--", month uses the standard three-letter macros, and
% ISSN/ISBN hold the bare identifier. Braces in titles protect acronyms and
% proper nouns from style-driven recasing.

@inproceedings{karabulut_aysu_2024,
  author       = {Karabulut, Emre and Aysu, Aydin},
  title        = {A Hardware-Software Co-Design for the Discrete {Gaussian} Sampling of {FALCON} Digital Signature},
  booktitle    = {2024 {IEEE} International Symposium on Hardware Oriented Security and Trust ({HOST})},
  year         = {2024},
  pages        = {90--100},
  isbn         = {979-8-3503-7395-0},
  issn         = {2835-5709},
  doi          = {10.1109/HOST55342.2024.10545399},
  abstractNote = {Sampling random values from a discrete Gaussian distribution with high precision is a major and computationally-intensive operation of emerging and existing cryptographic standards. FALCON is one such algorithm that the National Institute of Standards and Technology chose to standardize as a next-generation, quantum-secure digital signature algorithm. The discrete Gaussian sampling of FALCON has both flexibility and efficiency needs—it constitutes 72% of total signature generation in reference software and requires sampling from a variable mean and standard deviation. Unfortunately, there are no prior works on accelerating this complete sampling procedure. In this paper, we propose a hardware-software co-design for accelerating FALCON's discrete Gaussian sampling subroutine. The proposed solution handles the flexible computations for setting the variable parameters in software and executes core operations with low latency, parameterized, and custom hardware. The hardware parameterization allows trading off area vs. performance. On a Xilinx SoC FPGA Architecture, the results show that compared to the reference software, our solution can accelerate the sampling up to 56.05× and the full signature scheme by 1.67×. Moreover, we quantified that our optimized multiplier circuits can improve the throughput over a straightforward implementation by 2.87×.},
}

@inproceedings{aydin_karabulut_aysu_2024,
  author    = {Aydin, Furkan and Karabulut, Emre and Aysu, Aydin},
  title     = {Extended Abstract: Pre-Silicon Vulnerability Assessment for {AI/ML} Hardware},
  booktitle = {Proceedings of the Great Lakes Symposium on {VLSI} 2024 ({GLSVLSI} 2024)},
  year      = {2024},
  pages     = {495},
  issn      = {1066-1395},
  doi       = {10.1145/3649476.3660388},
}

@article{aydin_aysu_2024,
  author  = {Aydin, Furkan and Aysu, Aydin},
  title   = {Leaking Secrets in Homomorphic Encryption with Side-Channel Attacks},
  journal = {Journal of Cryptographic Engineering},
  year    = {2024},
  month   = jan,
  issn    = {2190-8516},
  doi     = {10.1007/s13389-023-00340-2},
}

@inproceedings{dubey_cammarota_varna_kumar_aysu_2023,
  author       = {Dubey, Anuj and Cammarota, Rosario and Varna, Avinash and Kumar, Raghavan and Aysu, Aydin},
  title        = {Hardware-Software Co-design for Side-Channel Protected Neural Network Inference},
  booktitle    = {2023 {IEEE} International Symposium on Hardware Oriented Security and Trust ({HOST})},
  year         = {2023},
  pages        = {155--166},
  issn         = {2835-5709},
  doi          = {10.1109/HOST55118.2023.10133716},
  abstractNote = {Physical side-channel attacks are a major threat to stealing confidential data from devices. There has been a recent surge in such attacks on edge machine learning (ML) hardware to extract the model parameters. Consequently, there has also been work, although limited, on building corresponding defenses against such attacks. Current solutions take either fully software- or fully hardware-centric approaches, which are limited in performance and flexibility, respectively. In this paper, we propose the first hardware-software co-design solution for building side-channel-protected ML hardware. Our solution targets edge devices and addresses both performance and flexibility needs. To that end, we develop a secure RISCV-based coprocessor design that can execute a neural network implemented in C/C++. Our coprocessor uses masking to execute various neural network operations like weighted summations, activation functions, and output layer computation in a side-channel secure fashion. We extend the original RV32I instruction set with custom instructions to control the masking gadgets inside the secure coprocessor. We further use the custom instructions to implement easy-to-use APIs that are exposed to the end-user as a shared library. Finally, we demonstrate the empirical side-channel security of the design up to 1M traces.},
}

@inproceedings{karabulut_awad_aysu_2023,
  author       = {Karabulut, Emre and Awad, Amro and Aysu, Aydin},
  title        = {{SS-AXI}: Secure and Safe Access Control Mechanism for Multi-Tenant Cloud {FPGAs}},
  booktitle    = {2023 {IEEE} International Symposium on Circuits and Systems ({ISCAS})},
  year         = {2023},
  issn         = {0271-4302},
  doi          = {10.1109/ISCAS46773.2023.10181609},
  abstractNote = {FPGAs are newly added to the cloud to offer energy-efficient acceleration. Multi-tenancy is an emerging phenomenon in cloud FPGAs to enable resource efficiency. In a multi-tenant scenario, multiple users can share the same FPGA fabric either spatially (i.e., tenants share different resources at the same time) or temporally (tenants share the same resources in different time slots). Undesired access or manipulation of other tenant's data can cause security and safety issues. Although safety/security concepts in access control policies have been thoroughly studied in conventional cloud systems, they are relatively unknown for cloud FPGAs. Moreover, these concepts may not trivially extend to cloud FPGAs due to their different nature. This paper proposes an improved access control mechanism for multi-tenant cloud FPGAs. Compared to existing commercial tools, our solution allows dynamic configuration of access control privileges. Compared to earlier academic proposals with dynamic configuration, the results show that our proposal has three advantages: (i) enabling secure resource sharing of on-chip BRAMs to tenants, (ii) enabling safe sharing by resolving deadlocks and faulty access requests, and (iii) improvement in latency and throughput.},
}

@article{mert_karabulut_ozturk_savas_aysu_2022,
  title        = {An Extensive Study of Flexible Design Methods for the Number Theoretic Transform},
  volume       = {71},
  issn         = {1557-9956},
  doi          = {10.1109/TC.2020.3017930},
  abstractNote = {Efficient lattice-based cryptosystems operate with polynomial rings with the Number Theoretic Transform (NTT) to reduce the computational complexity of polynomial multiplication. NTT has therefore become a major arithmetic component (thus computational bottleneck) in various cryptographic constructions like hash functions, key-encapsulation mechanisms, digital signatures, and homomorphic encryption. Although there exist several hardware designs in prior work for NTT, they all are isolated design instances fixed for specific NTT parameters or parallelization level. This article provides an extensive study of flexible design methods for NTT implementation. To that end, we evaluate three cases: (1) parametric hardware design, (2) high-level synthesis (HLS) design approach, and (3) design for software implementation compiled on soft-core processors, where all are targeted on reconfigurable hardware devices. We evaluate the designs that implement multiple NTT parameters and/or processing elements, demonstrate the design details for each case, and provide a fair comparison with each other and prior work. 
On a Xilinx Virtex-7 FPGA, compared to HLS and processor-based methods, the results show that the parametric hardware design is on average $4.4\times$ and $73.9\times$ smaller and $22.5\times$ and $19.3\times$ faster, respectively. Surprisingly, HLS tools can yield less efficient solutions than processor-based approaches in some cases.},
  number       = {11},
  journal      = {{IEEE} Transactions on Computers},
  author       = {Mert, Ahmet Can and Karabulut, Emre and Ozturk, Erdinc and Savas, Erkay and Aysu, Aydin},
  year         = {2022},
  month        = nov,
  pages        = {2829--2843},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{haas_aysu_2022,
  author       = {Haas, Gregor and Aysu, Aydin},
  title        = {{Apple} vs. {EMA}: Electromagnetic Side Channel Attacks on {Apple} {CoreCrypto}},
  booktitle    = {Proceedings of the 59th {ACM/IEEE} Design Automation Conference ({DAC} 2022)},
  year         = {2022},
  pages        = {247--252},
  doi          = {10.1145/3489517.3530437},
  abstractNote = {Cryptographic instruction set extensions are commonly used for ciphers which would otherwise face unacceptable side channel risks. A prominent example of such an extension is the ARMv8 Cryptographic Extension, or ARM CE for short, which defines dedicated instructions to securely accelerate AES. However, while these extensions may be resistant to traditional "digital" side channel attacks, they may still be vulnerable to physical side channel attacks. In this work, we demonstrate the first such attack on a standard ARM CE AES implementation. We specifically focus on the implementation used by Apple's CoreCrypto library which we run on the Apple A10 Fusion SoC. To that end, we implement an optimized side channel acquisition infrastructure involving both custom iPhone software and accelerated analysis code. We find that an adversary which can observe 5--30 million known-ciphertext traces can reliably extract secret AES keys using electromagnetic (EM) radiation as a side channel. This corresponds to an encryption operation on less than half of a gigabyte of data, which could be acquired in less than 2 seconds on the iPhone 7 we examined. Our attack thus highlights the need for side channel defenses for real devices and production, industry-standard encryption software.},
}

% Conference paper. The former url field only repeated the DOI through the
% dx.doi.org resolver, so the DOI alone is kept.
@inproceedings{gajjar_kashyap_aysu_franzon_dey_cheng_2022,
  author       = {Gajjar, Archit and Kashyap, Priyank and Aysu, Aydin and Franzon, Paul and Dey, Sumon and Cheng, Chris},
  title        = {{FAXID}: {FPGA}-Accelerated {XGBoost} Inference for Data Centers using {HLS}},
  booktitle    = {2022 {IEEE} 30th International Symposium on Field-Programmable Custom Computing Machines ({FCCM} 2022)},
  publisher    = {IEEE},
  year         = {2022},
  pages        = {113--121},
  issn         = {2576-2621},
  doi          = {10.1109/FCCM53951.2022.9786085},
  abstractNote = {Advanced ensemble trees have proven quite effective in providing real-time predictions against ransomware detection, medical diagnosis, recommendation engines, fraud detection, failure predictions, crime risk, to name a few. Especially, XGBoost, one of the most prominent and widely used decision trees, has gained popularity due to various optimizations on gradient boosting framework that provides increased accuracy for classification and regression problems. XGBoost’s ability to train relatively faster, handling missing values, flexibility and parallel processing make it a better candidate to handle data center workload. Today’s data centers with enormous Input/Output Operations per Second (IOPS) demand a real-time accelerated inference with low latency and high throughput because of significant data processing due to applications such as ransomware detection or fraud detection. This paper showcases an FPGA-based XGBoost accelerator designed with High-Level Synthesis (HLS) tools and design flow accelerating binary classification inference. We employ Alveo U50 and U200 to demonstrate the performance of the proposed design and compare it with existing state-of-the-art CPU (Intel Xeon E5-2686 v4) and GPU (Nvidia Tensor Core T4) implementations with relevant datasets. We show a latency speedup of our proposed design over state-of-art CPU and GPU implementations, including energy efficiency and cost-effectiveness. The proposed accelerator is up to 65.8x and 5.3x faster, in terms of latency than CPU and GPU, respectively. The Alveo U50 is a more cost-effective device, and the Alveo U200 stands out as more energy-efficient.},
}

@article{dubey_cammarota_suresh_aysu_2022,
  title        = {Guarding Machine Learning Hardware Against Physical Side-channel Attacks},
  volume       = {18},
  issn         = {1550-4840},
  doi          = {10.1145/3465377},
  abstractNote = {Machine learning (ML) models can be trade secrets due to their development cost. Hence, they need protection against malicious forms of reverse engineering (e.g., in IP piracy). With a growing shift of ML to the edge devices, in part for performance and in part for privacy benefits, the models have become susceptible to the so-called physical side-channel attacks. ML being a relatively new target compared to cryptography poses the problem of side-channel analysis in a context that lacks published literature. The gap between the burgeoning edge-based ML devices and the research on adequate defenses to provide side-channel security for them thus motivates our study. Our work develops and combines different flavors of side-channel defenses for ML models in the hardware blocks. We propose and optimize the first defense based on Boolean masking. We first implement all the masked hardware blocks. We then present an adder optimization to reduce the area and latency overheads. Finally, we couple it with a shuffle-based defense. We quantify that the area-delay overhead of masking ranges from 5.4× to 4.7× depending on the adder topology used and demonstrate a first-order side-channel security of millions of power traces. 
Additionally, the shuffle countermeasure impedes a straightforward second-order attack on our first-order masked implementation.},
  number       = {3},
  journal      = {{ACM} Journal on Emerging Technologies in Computing Systems},
  author       = {Dubey, Anuj and Cammarota, Rosario and Suresh, Vikram and Aysu, Aydin},
  year         = {2022},
  month        = jul,
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{dubey_karabulut_awad_aysu_2022,
  author       = {Dubey, Anuj and Karabulut, Emre and Awad, Amro and Aysu, Aydin},
  title        = {High-Fidelity Model Extraction Attacks via Remote Power Monitors},
  booktitle    = {2022 {IEEE} International Conference on Artificial Intelligence Circuits and Systems ({AICAS} 2022): Intelligent Technology in the Post-Pandemic Era},
  year         = {2022},
  pages        = {328--331},
  doi          = {10.1109/AICAS54282.2022.9869973},
  abstractNote = {This paper shows the first side-channel attack on neural network (NN) IPs through a remote power monitor. We demonstrate that a remote monitor implemented with time-to-digital converters can be exploited to steal the weights from a hardware implementation of NN inference. Such an attack alleviates the need to have physical access to the target device and thus expands the attack vector to multi-tenant cloud FPGA platforms. Our results quantify the effectiveness of the attack on an FPGA implementation of NN inference and compare it to an attack with physical access. We demonstrate that it is indeed possible to extract the weights using DPA with 25000 traces if the SNR is sufficient. The paper, therefore, motivates secure virtualization—to protect the confidentiality of high-valued NN model IPs in multi-tenant execution environments, platform developers need to employ strong countermeasures against physical side-channel attacks.},
}

% NOTE(review): title restored to "SeqL+" from the exported "SeqL plus :";
% confirm against the publisher record for the DOI below.
@article{potluri_kundu_kumar_basu_aysu_2023,
  title        = {{SeqL+}: Secure Scan-Obfuscation With Theoretical and Empirical Validation},
  volume       = {42},
  issn         = {1937-4151},
  doi          = {10.1109/TCAD.2022.3199153},
  abstractNote = {Scan-obfuscation is a powerful methodology to protect Silicon-based intellectual property from theft. Prior work on scan-obfuscation in the context of logic-locking have unique limitations, which are addressed by our previous work, SeqL, which looks at functional output corruption to obfuscate scan-chains, but is unable to resist removal attacks on circuits with inadequate number of flip-flops without feedback. To address this issue, we propose to scramble flip-flops with feedback to increase key length without introducing further vulnerabilities. This study reveals the first formulation and complexity analysis of Boolean satisfiability (SAT)-based attack on scan-scrambling. We formulate the attack as a conjunctive normal form (CNF) using a worst-case $\mathcal{O}(n^{3})$ reduction in terms of scramble-graph size $n$. 
In order to defeat SAT-based attack, we propose an iterative swapping-based scan-cell scrambling algorithm that has $\mathcal{O}(n)$ implementation time-complexity and $\mathcal{O}(2^{\lfloor (\alpha \cdot n + 1)/3 \rfloor})$ SAT-decryption time-complexity in terms of a user-configurable cost constraint $\alpha~(0 < \alpha \le 1)$.},
  number       = {5},
  journal      = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  author       = {Potluri, Seetal and Kundu, Shamik and Kumar, Akash and Basu, Kanad and Aysu, Aydin},
  year         = {2023},
  month        = may,
  pages        = {1406--1410},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{sayadi_aliasgari_aydin_potluri_aysu_edmonds_tehranipoor_2022,
  author       = {Sayadi, Hossein and Aliasgari, Mehrdad and Aydin, Furkan and Potluri, Seetal and Aysu, Aydin and Edmonds, Jack and Tehranipoor, Sara},
  title        = {Towards {AI}-Enabled Hardware Security: Challenges and Opportunities},
  booktitle    = {2022 {IEEE} 28th International Symposium on On-Line Testing and Robust System Design ({IOLTS} 2022)},
  year         = {2022},
  issn         = {1942-9398},
  doi          = {10.1109/IOLTS56730.2022.9897507},
  abstractNote = {Recent developments in Artificial Intelligence (AI) and Machine Learning (ML), driven by a substantial increase in the size of data in emerging computing systems, have led into successful applications of such intelligent techniques in various disciplines including security. Traditionally, integrity of data has been protected with various security protocols at the software level with the underlying hardware assumed to be secure. This assumption however is no longer true with an increasing number of attacks reported on the hardware. The emergence of new security threats (e.g., malware, side-channel attacks, etc.) requires patching/updating the software-based solutions that needs a vast amount of memory and hardware resources. Therefore, the security should be delegated to the underlying hardware, building a bottom-up solution for securing computing devices rather than treating it as an afterthought. This paper highlights the growing role of AI/ML techniques in hardware and architecture security field and provides insightful discussions on pressing challenges, opportunities, and future directions of designing accurate and efficient machine learning-based attacks and defense mechanisms in response to emerging hardware security vulnerabilities in modern computer systems and next generation of cryptosystems.},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{chen_karabulut_aysu_ma_jing_2021,
  author       = {Chen, Zhaohui and Karabulut, Emre and Aysu, Aydin and Ma, Yuan and Jing, Jiwu},
  title        = {An Efficient Non-Profiled Side-Channel Attack on the {CRYSTALS-Dilithium} Post-Quantum Signature},
  booktitle    = {2021 {IEEE} 39th International Conference on Computer Design ({ICCD} 2021)},
  year         = {2021},
  pages        = {583--590},
  issn         = {1063-6404},
  doi          = {10.1109/ICCD53106.2021.00094},
  abstractNote = {Post-quantum digital signature is a critical primitive of computer security in the era of quantum hegemony. As a finalist of the post-quantum cryptography standardization process, the theoretical security of the CRYSTALS-Dilithium (Dilithium) signature scheme has been quantified to withstand classical and quantum cryptanalysis. However, there is an inherent power side-channel information leakage in its implementation instance due to the physical characteristics of hardware. This work proposes an efficient non-profiled Correlation Power Analysis (CPA) strategy on Dilithium to recover the secret key by targeting the underlying polynomial multiplication arithmetic. We first develop a conservative scheme with a reduced key guess space, which can extract a secret key coefficient with a 99.99% confidence using 157 power traces of the reference Dilithium implementation. However, this scheme suffers from the computational overhead caused by the large modulus in Dilithium signature. To further accelerate the CPA run-time, we propose a fast two-stage scheme that selects a smaller search space and then resolves false positives. We finally construct a hybrid scheme that combines the advantages of both schemes. Real-world experiment on the power measurement data shows that our hybrid scheme improves the attack’s execution time by 7.77×.},
}

@article{karabulut_alkim_aysu_2021,
  title        = {Efficient, Flexible, and Constant-Time {Gaussian} Sampling Hardware for Lattice Cryptography},
  volume       = {71},
  issn         = {1557-9956},
  doi          = {10.1109/TC.2021.3107729},
  abstractNote = {This paper proposes a discrete Gaussian sampling hardware design that can flexibly support different sampling parameters, that is more efficient (in area-delay product) compared to the majority of earlier proposals, and that has constant execution time. The proposed design implements a Cumulative Distribution Table (CDT) approach, reduces the table size with Gaussian convolutions, and adopts an innovative fusion tree search algorithm to achieve a compact and fast sampling technique—to our best knowledge, this is the first hardware implementation of fusion tree search algorithm. The proposed hardware can support all the discrete Gaussian distributions used in post-quantum digital signatures and key encapsulation algorithms (FALCON, qTESLA, and FrodoKEM), the homomorphic encryption library of SEAL, and other algorithms such BLISS digital signature and LP public-key encryption. Our proposed hardware can be configured at design-time to optimize a single configuration or at run-time to support multiple Gaussian distribution parameters. 
Our design, furthermore, has constant-time behavior by design, eliminating timing side-channel attacks—this is achieved by reading all table contents at the same time to also reduce the latency. The results on a Xilinx Virtex-7 FPGA show that our solution can outperform all prior proposals in area-delay product by 1.67–235.88×, only falling short to those designed for the LP encryption scheme.},
  number       = {8},
  journal      = {{IEEE} Transactions on Computers},
  author       = {Karabulut, Emre and Alkim, Erdem and Aysu, Aydin},
  year         = {2021},
  month        = aug,
  pages        = {1810--1823},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{karabulut_aysu_2021,
  author       = {Karabulut, Emre and Aysu, Aydin},
  title        = {{FALCON} Down: Breaking {FALCON} Post-Quantum Signature Scheme through Side-Channel Attacks},
  booktitle    = {2021 58th {ACM/IEEE} Design Automation Conference ({DAC})},
  year         = {2021},
  pages        = {691--696},
  issn         = {0738-100X},
  doi          = {10.1109/DAC18074.2021.9586131},
  abstractNote = {This paper proposes the first side-channel attack on FALCON—a NIST Round-3 finalist for the post-quantum digital signature standard. We demonstrate a known-plaintext attack that uses the electromagnetic measurements of the device to extract the secret signing keys, which then can be used to forge signatures on arbitrary messages. The proposed attack targets the unique floating-point multiplications within FALCON’s Fast Fourier Transform through a novel extend-and-prune strategy that extracts the sign, mantissa, and exponent variables without false positives. The extracted floating-point values are then mapped back to the secret key’s coefficients. Our attack, notably, does not require pre-characterizing the power profile of the target device or crafting special inputs. Instead, the statistical differences on obtained traces are sufficient to successfully execute our proposed differential electromagnetic analysis. The results on an ARM-Cortex-M4 running the FALCON NIST’s reference software show that approximately 10k measurements are sufficient to extract the entire key.},
}

% The key carries a "b" suffix because karabulut_alkim_aysu_2021 is already
% used by the IEEE Transactions on Computers article on Gaussian sampling
% hardware; BibTeX reports a repeated entry and drops the second definition
% when two entries share a key.
@inproceedings{karabulut_alkim_aysu_2021b,
  author       = {Karabulut, Emre and Alkim, Erdem and Aysu, Aydin},
  title        = {Single-Trace Side-Channel Attacks on {$\omega$}-Small Polynomial Sampling},
  booktitle    = {2021 {IEEE} International Symposium on Hardware Oriented Security and Trust ({HOST})},
  year         = {2021},
  pages        = {35--45},
  doi          = {10.1109/HOST49136.2021.9702284},
  abstractNote = {This paper proposes a new single-trace side-channel attack on lattice-based post-quantum protocols. We target the ω-small polynomial sampling of NTRU, NTRU Prime, and CRYSTALS-DILITHIUM algorithm implementations (which are NIST Round-3 finalist and alternative candidates), and we demonstrate the vulnerabilities of their sub-routines to a power-based side-channel attack. Specifically, we reveal that the sorting implementation in NTRU/NTRU Prime and the shuffling in CRYSTALS-DILITHIUM's ω-small polynomial sampling process leaks information about the ‘-1’, ‘0’, or ‘+1’ assignments made to the coefficients. We further demonstrate that these assignments can be found within a single power measurement and that revealing them allows secret and session key recovery for NTRU/NTRU Prime, while reducing the challenge polynomial's entropy for CRYSTALS-DILITHIUM. We execute our proposed attacks on an ARM Cortex-M4 microcontroller running the reference software submissions from NIST Round-3 software packages. The results show that our attacks can extract coefficients with a success rate of 99.78% for NTRU and NTRU Prime, reducing the search space to $2^{41}$ or below. For CRYSTALS-DILITHIUM, our attack recovers the coefficients’ signs with over 99.99% success, reducing rejected challenge polynomials’ entropy between 39 to 60 bits. Our work informs the proposers about the single-trace vulnerabilities of their software and urges them to develop single-trace resilient software for low-cost microcontrollers.},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{potluri_aysu_2021,
  author       = {Potluri, Seetal and Aysu, Aydin},
  title        = {Stealing Neural Network Models through the Scan Chain: A New Threat for {ML} Hardware},
  booktitle    = {2021 {IEEE/ACM} International Conference on Computer-Aided Design ({ICCAD})},
  year         = {2021},
  issn         = {1933-7760},
  doi          = {10.1109/ICCAD51958.2021.9643547},
  abstractNote = {Stealing trained machine learning (ML) models is a new and growing concern due to the model's development cost. Existing work on ML model extraction either applies a mathematical attack or exploits hardware vulnerabilities such as side-channel leakage. This paper shows a new style of attack, for the first time, on ML models running on embedded devices by abusing the scan-chain infrastructure. We illustrate that having coarse-grained scan-chain access to non-linear layer outputs is sufficient to steal ML models. To that end, we propose a novel small-signal analysis inspired attack that applies small perturbations into the input signals, identifies the quiescent operating points and, selectively activates certain neurons. We then couple this with a Linear Constraint Satisfaction based approach to efficiently extract model parameters such as weights and biases. We conduct our attack on neural network inference topologies defined in earlier works, and we automate our attack. The results show that our attack outperforms mathematical model extraction proposed in CRYPTO 2020, USENIX 2020, and ICML 2020 by an increase in accuracy of $2^{20.7}\times, 2^{50.7}\times$, and $2^{33.9}\times$, respectively, and a reduction in queries by $2^{6.5}\times, 2^{4.6}\times$, and $2^{14.2}\times$, respectively.},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{haas_potluri_aysu_2021,
  author       = {Haas, Gregor and Potluri, Seetal and Aysu, Aydin},
  title        = {{iTimed}: Cache Attacks on the {Apple A10 Fusion SoC}},
  booktitle    = {2021 {IEEE} International Symposium on Hardware Oriented Security and Trust ({HOST})},
  year         = {2021},
  pages        = {80--90},
  doi          = {10.1109/HOST49136.2021.9702290},
  abstractNote = {This paper proposes the first cache timing side-channel attack on one of Apple's mobile devices. Utilizing a recent, permanent exploit named checkm8, we reverse-engineered Apple's BootROM and created a powerful toolkit for running arbitrary hardware security experiments on Apple's in-house designed ARM systems-on-a-chip (SoC). Using this toolkit, we then implement an access-driven cache timing attack (in the style of PRIME+PROBE) as a proof-of-concept illustrator. The advanced hardware control enabled by our toolkit allowed us to reverse-engineer key microarchitectural details of the Apple A10 Fusion's memory hierarchy. We find that the SoC employs a randomized cache-line replacement policy as well as a hardware-based L1 prefetcher. We propose statistical innovations which specifically account for these hardware structures and thus further the state-of-the-art in cache timing attacks. We find that our access-driven attack, at best, can reduce the security of OpenSSL AES-128 by 50 more bits than a straightforward adaptation of PRIME+PROBE, while requiring only half as many side channel measurement traces.},
}

% The former url field only repeated the DOI through the dx.doi.org
% resolver, so the DOI alone is kept.
@article{kashyap_aydin_potluri_franzon_aysu_2021,
  title        = {{2Deep}: Enhancing Side-Channel Attacks on Lattice-Based Key-Exchange via {2-D} Deep Learning},
  volume       = {40},
  issn         = {1937-4151},
  doi          = {10.1109/TCAD.2020.3038701},
  abstractNote = {Advancements in quantum computing present a security threat to classical cryptography algorithms. Lattice-based key exchange protocols show strong promise due to their resistance to theoretical quantum-cryptanalysis and low implementation overhead. By contrast, their physical implementations have shown vulnerability against side-channel attacks (SCAs) even with a single power measurement. The state-of-the-art SCAs are, however, limited to simple, sequentialized executions of post-quantum key-exchange (PQKE) protocols, leaving the vulnerability of complex, parallelized architectures unknown. This article proposes 2Deep—a deep-learning (DL)-based SCA—targeting parallelized implementations of PQKE protocols, namely, Frodo and NewHope with data augmentation techniques. Specifically, we explore approaches that convert 1-D time-series power measurement data into 2-D images to formulate SCA as an image recognition task. The results show our attack’s superiority over conventional techniques including horizontal differential power analysis (DPA), template attacks (TAs), and straightforward DL approaches. We demonstrate improvements up to $1.5\times$ to recover a 100% success rate compared to DL with 1-D input data while using fewer data. 
We furthermore show that machine learning improves the results up to $1.25\times$ compared to TAs. Furthermore, we perform cross-device attacks that obtain profiles from a single device, which has never been explored. Our 2-D approach is especially favored in this setting, improving the success rate of attacking Frodo from 20% to 99% compared to the 1-D approach. Our work thus urges countermeasures even on parallel architectures and single-trace attacks.},
  number       = {6},
  journal      = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  publisher    = {Institute of Electrical and Electronics Engineers (IEEE)},
  author       = {Kashyap, Priyank and Aydin, Furkan and Potluri, Seetal and Franzon, Paul D. and Aysu, Aydin},
  year         = {2021},
  month        = jun,
  pages        = {1217--1229},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{dubey_cammarota_aysu_2020,
  author       = {Dubey, Anuj and Cammarota, Rosario and Aysu, Aydin},
  title        = {{BoMaNet}: {Boolean} Masking of an Entire Neural Network},
  booktitle    = {2020 {IEEE/ACM} International Conference on Computer-Aided Design ({ICCAD})},
  year         = {2020},
  issn         = {1933-7760},
  doi          = {10.1145/3400302.3415649},
  abstractNote = {Recent work on stealing machine learning (ML) models from inference engines with physical side-channel attacks warrant an urgent need for effective side-channel defenses. This work proposes the first fully-masked neural network inference engine design. Masking uses secure multi-party computation to split the secrets into random shares and to decorrelate the statistical relation of secret-dependent computations to side-channels (e.g., the power draw). In this work, we construct secure hardware primitives to mask all the linear and non-linear operations in a neural network. We address the challenge of masking integer addition by converting each addition into a sequence of XOR and AND gates and by augmenting Trichina's secure Boolean masking style. We improve the traditional Trichina's AND gates by adding pipelining elements for better glitch-resistance and we architect the whole design to sustain a throughput of 1 masked addition per cycle. We implement the proposed secure inference engine on a Xilinx Spartan-6 (XC6SLX75) FPGA. The results show that masking incurs an overhead of 3.5% in latency and 5.9× in area. Finally, we demonstrate the security of the masked design with 2M traces.},
}

@article{ozcan_aysu_2020,
  author       = {Ozcan, Erdem and Aysu, Aydin},
  title        = {High-Level Synthesis of Number-Theoretic Transform: A Case Study for Future Cryptosystems},
  journal      = {{IEEE} Embedded Systems Letters},
  volume       = {12},
  number       = {4},
  year         = {2020},
  month        = dec,
  pages        = {133--136},
  issn         = {1943-0671},
  doi          = {10.1109/LES.2019.2960457},
  abstractNote = {Compared to traditional hardware development methodologies, high-level synthesis (HLS) offers a faster time-to-market and lower design-cost at the expense of implementation efficiency. Although HLS tools are becoming popular in some applications, such as digital signal processing and neural network classification, their usability on cryptographic applications is largely unexplored. This feasibility is critical especially for cryptosystems that are under development, such as the next-generation public-key cryptosystems needed for quantum-resistance. This letter provides a thorough investigation of HLS on number theoretic transform (NTT)—the core arithmetic function of lattice-based quantum-resistant cryptosystems. We demonstrate a fast yet extensive design space exploration of NTT through the Vivado HLS tool, analyze the shortcomings/challenges of optimized configurations, and quantitatively compare the results to software-based and hand-coded hardware designs.},
}

% Key normalised: the exported key contained spaces and a period
% ("...di natale...et al._2020"), which is not a usable citation key.
% The truncated author list ends with the "and others" token so that the
% bibliography style renders "et al." itself.
@inproceedings{regazzoni_bhasin_pour_alshaer_aydin_aysu_beroulle_di_natale_franzon_hely_et_al_2020,
  author       = {Regazzoni, Francesco and Bhasin, Shivam and Pour, Amir Ali and Alshaer, Ihab and Aydin, Furkan and Aysu, Aydin and Beroulle, Vincent and Di Natale, Giorgio and Franzon, Paul and Hely, David and others},
  title        = {Machine Learning and Hardware Security: Challenges and Opportunities --- Invited Talk},
  booktitle    = {2020 {IEEE/ACM} International Conference on Computer-Aided Design ({ICCAD})},
  year         = {2020},
  issn         = {1933-7760},
  doi          = {10.1145/3400302.3416260},
  abstractNote = {Machine learning techniques have significantly changed our lives. They helped improving our everyday routines, but they also demonstrated to be an extremely helpful tool for more advanced and complex applications. However, the implications of hardware security problems under a massive diffusion of machine learning techniques are still to be completely understood. This paper first highlights novel applications of machine learning for hardware security, such as evaluation of post quantum cryptography hardware and extraction of physically unclonable functions from neural networks. Later, practical model extraction attack based on electromagnetic side-channel measurements are demonstrated followed by a discussion of strategies to protect proprietary models by watermarking them.},
}

% Conference paper: venue is stored in booktitle, not journal.
@inproceedings{karabulut_aysu_2020,
  author       = {Karabulut, Emre and Aysu, Aydin},
  title        = {{RANTT}: A {RISC-V} Architecture Extension for the Number Theoretic Transform},
  booktitle    = {2020 30th International Conference on Field-Programmable Logic and Applications ({FPL})},
  year         = {2020},
  pages        = {26--32},
  issn         = {1946-1488},
  doi          = {10.1109/FPL50879.2020.00016},
  abstractNote = {Lattice-based cryptography has been growing in demand due to their quantum attack resiliency. Polynomial multiplication is a major computational bottleneck of lattice cryptosystems. To address the challenge, lattice-based cryptosystems use the Number Theoretic Transform (NTT). Although NTT reduces complexity, it is still a well-known computational bottleneck. At the same time, NTT arithmetic needs vary for different algorithms, motivating flexible solutions. Although there are prior hardware and software NTT designs, they do not simultaneously offer flexibility and efficiency. This work provides an efficient and flexible NTT solution through domain-specific architectural support on RISC-V. Rather than using instruction-set extensions with compiler modifications or loosely coupling a RISC-V core with an NTT co-processor, our proposal uses application-specific dynamic instruction scheduling, memory dependence prediction, and datapath optimizations. This allows achieving a direct translation of C code to optimized NTT executions. We demonstrate the flexibility of our approach by implementing the NTT used in several lattice-based cryptography protocols: NewHope, qTESLA, CRYSTALS-Kyber, CRYSTALS-Dilithium, and Falcon. The results on the FPGA technology show that the proposed design is respectively 6x, 40x, and 3x more efficient than the baseline solution, Berkeley Out-of-Order Machine, and a prior HW/SW co-design, while providing the needed flexibility.},
}

@article{aysu_2019, title={Teaching the Next Generation of Cryptographic Hardware Design to the Next Generation of Engineers}, ISSN={["1066-1395"]}, DOI={10.1145/3299874.3317994}, abstractNote={Evolving threats against cryptographic systems and the increasing diversity of computing platforms enforce teaching cryptographic engineering to a wider audience. This paper describes the development of a new graduate course on hardware security taught at North Carolina State University during Fall 2018. The course targets an audience with no background on cryptography or hardware vulnerabilities. The course focuses especially on post-quantum cryptosystems---the next-generation cryptosystems mitigating quantum computer attacks---and evolves into designing specialized hardware accelerators for post-quantum cryptography, executing sophisticated implementation attacks (e.g., side-channel and fault attacks), and building countermeasures on such hardware designs. 
We discuss the curriculum design, hands-on assignment's development, final research project outcome, and the results obtained from the course together with the associated challenges. Our experience shows that such a course is feasible, can achieve its goals, and liked by the students, but there is room for improvement.}, journal={GLSVLSI '19 - PROCEEDINGS OF THE 2019 ON GREAT LAKES SYMPOSIUM ON VLSI}, author={Aysu, Aydin}, year={2019}, pages={237–242} }