@article{karabulut_awad_aysu_2023, title={SS-AXI: Secure and Safe Access Control Mechanism for Multi-Tenant Cloud FPGAs}, ISSN={["0271-4302"]}, DOI={10.1109/ISCAS46773.2023.10181609}, abstractNote={FPGAs are newly added to the cloud to offer energy-efficient acceleration. Multi-tenancy is an emerging phenomenon in cloud FPGAs to enable resource efficiency. In a multi-tenant scenario, multiple users can share the same FPGA fabric either spatially (i.e., tenants share different resources at the same time) or temporally (tenants share the same resources in different time slots). Undesired access or manipulation of other tenant's data can cause security and safety issues. Although safety/security concepts in access control policies have been thoroughly studied in conventional cloud systems, they are relatively unknown for cloud FPGAs. Moreover, these concepts may not trivially extend to cloud FPGAs due to their different nature. This paper proposes an improved access control mechanism for multi-tenant cloud FPGAs. Compared to existing commercial tools, our solution allows dynamic configuration of access control privileges. Compared to earlier academic proposals with dynamic configuration, the results show that our proposal has three advantages: (i) enabling secure resource sharing of on-chip BRAMs to tenants, (ii) enabling safe sharing by resolving deadlocks and faulty access requests, and (iii) improvement in latency and throughput.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON CIRCUITS AND SYSTEMS, ISCAS}, author={Karabulut, Emre and Awad, Amro and Aysu, Aydin}, year={2023} } @article{mert_karabulut_ozturk_savas_aysu_2022, title={An Extensive Study of Flexible Design Methods for the Number Theoretic Transform}, volume={71}, ISSN={["1557-9956"]}, DOI={10.1109/TC.2020.3017930}, abstractNote={Efficient lattice-based cryptosystems operate with polynomial rings with the Number Theoretic Transform (NTT) to reduce the computational complexity of polynomial multiplication. NTT has therefore become a major arithmetic component (thus computational bottleneck) in various cryptographic constructions like hash functions, key-encapsulation mechanisms, digital signatures, and homomorphic encryption. Although there exist several hardware designs in prior work for NTT, they all are isolated design instances fixed for specific NTT parameters or parallelization level. This article provides an extensive study of flexible design methods for NTT implementation. To that end, we evaluate three cases: (1) parametric hardware design, (2) high-level synthesis (HLS) design approach, and (3) design for software implementation compiled on soft-core processors, where all are targeted on reconfigurable hardware devices. We evaluate the designs that implement multiple NTT parameters and/or processing elements, demonstrate the design details for each case, and provide a fair comparison with each other and prior work. On a Xilinx Virtex-7 FPGA, compared to HLS and processor-based methods, the results show that the parametric hardware design is on average $4.4\times$4.4× and $73.9\times$73.9× smaller and $22.5\times$22.5× and $19.3\times$19.3× faster, respectively. Surprisingly, HLS tools can yield less efficient solutions than processor-based approaches in some cases.}, number={11}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Mert, Ahmet Can and Karabulut, Emre and Ozturk, Erdinc and Savas, Erkay and Aysu, Aydin}, year={2022}, month={Nov}, pages={2829–2843} } @article{dubey_karabulut_awad_aysu_2022, title={High-Fidelity Model Extraction Attacks via Remote Power Monitors}, DOI={10.1109/AICAS54282.2022.9869973}, abstractNote={This paper shows the first side-channel attack on neural network (NN) IPs through a remote power monitor. We demonstrate that a remote monitor implemented with time-to-digital converters can be exploited to steal the weights from a hardware implementation of NN inference. Such an attack alleviates the need to have physical access to the target device and thus expands the attack vector to multi-tenant cloud FPGA platforms. Our results quantify the effectiveness of the attack on an FPGA implementation of NN inference and compare it to an attack with physical access. We demonstrate that it is indeed possible to extract the weights using DPA with 25000 traces if the SNR is sufficient. The paper, therefore, motivates secure virtualization-to protect the confidentiality of high-valued NN model IPs in multi-tenant execution environments, platform developers need to employ strong countermeasures against physical side-channel attacks.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON ARTIFICIAL INTELLIGENCE CIRCUITS AND SYSTEMS (AICAS 2022): INTELLIGENT TECHNOLOGY IN THE POST-PANDEMIC ERA}, author={Dubey, Anuj and Karabulut, Emre and Awad, Amro and Aysu, Aydin}, year={2022}, pages={328–331} } @article{chen_karabulut_aysu_ma_jing_2021, title={An Efficient Non-Profiled Side-Channel Attack on the CRYSTALS-Dilithium Post-Quantum Signature}, ISSN={["1063-6404"]}, DOI={10.1109/ICCD53106.2021.00094}, abstractNote={Post-quantum digital signature is a critical primitive of computer security in the era of quantum hegemony. As a finalist of the post-quantum cryptography standardization process, the theoretical security of the CRYSTALS-Dilithium (Dilithium) signature scheme has been quantified to withstand classical and quantum cryptanalysis. However, there is an inherent power side-channel information leakage in its implementation instance due to the physical characteristics of hardware.This work proposes an efficient non-profiled Correlation Power Analysis (CPA) strategy on Dilithium to recover the secret key by targeting the underlying polynomial multiplication arithmetic. We first develop a conservative scheme with a reduced key guess space, which can extract a secret key coefficient with a 99.99% confidence using 157 power traces of the reference Dilithium implementation. However, this scheme suffers from the computational overhead caused by the large modulus in Dilithium signature. To further accelerate the CPA run-time, we propose a fast two-stage scheme that selects a smaller search space and then resolves false positives. We finally construct a hybrid scheme that combines the advantages of both schemes. Real-world experiment on the power measurement data shows that our hybrid scheme improves the attack’s execution time by 7.77×.}, journal={2021 IEEE 39TH INTERNATIONAL CONFERENCE ON COMPUTER DESIGN (ICCD 2021)}, author={Chen, Zhaohui and Karabulut, Emre and Aysu, Aydin and Ma, Yuan and Jing, Jiwu}, year={2021}, pages={583–590} } @article{karabulut_alkim_aysu_2021, title={Efficient, Flexible, and Constant-Time Gaussian Sampling Hardware for Lattice Cryptography}, volume={71}, ISSN={["1557-9956"]}, DOI={10.1109/TC.2021.3107729}, abstractNote={This paper proposes a discrete Gaussian sampling hardware design that can flexibly support different sampling parameters, that is more efficient (in area-delay product) compared to the majority of earlier proposals, and that has constant execution time. The proposed design implements a Cumulative Distribution Table (CDT) approach, reduces the table size with Gaussian convolutions, and adopts an innovative fusion tree search algorithm to achieve a compact and fast sampling technique—to our best knowledge, this is the first hardware implementation of fusion tree search algorithm. The proposed hardware can support all the discrete Gaussian distributions used in post-quantum digital signatures and key encapsulation algorithms (FALCON, qTESLA, and FrodoKEM), the homomorphic encryption library of SEAL, and other algorithms such BLISS digital signature and LP public-key encryption. Our proposed hardware can be configured at design-time to optimize a single configuration or at run-time to support multiple Gaussian distribution parameters. Our design, furthermore, has constant-time behavior by design, eliminating timing side-channel attacks—this is achieved by reading all table contents at the same time to also reduce the latency. The results on a Xilinx Virtex-7 FPGA show that our solution can outperform all prior proposals in area-delay product by 1.67–235.88×, only falling short to those designed for the LP encryption scheme.}, number={8}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Karabulut, Emre and Alkim, Erdem and Aysu, Aydin}, year={2021}, month={Aug}, pages={1810–1823} } @article{karabulut_aysu_2021, title={FALCON Down: Breaking FALCON Post-Quantum Signature Scheme through Side-Channel Attacks}, ISSN={["0738-100X"]}, DOI={10.1109/DAC18074.2021.9586131}, abstractNote={This paper proposes the first side-channel attack on FALCON—a NIST Round-3 finalist for the post-quantum digital signature standard. We demonstrate a known-plaintext attack that uses the electromagnetic measurements of the device to extract the secret signing keys, which then can be used to forge signatures on arbitrary messages. The proposed attack targets the unique floating-point multiplications within FALCON’s Fast Fourier Transform through a novel extend-and-prune strategy that extracts the sign, mantissa, and exponent variables without false positives. The extracted floating-point values are then mapped back to the secret key’s coefficients. Our attack, notably, does not require pre-characterizing the power profile of the target device or crafting special inputs. Instead, the statistical differences on obtained traces are sufficient to successfully execute our proposed differential electromagnetic analysis. The results on an ARM-Cortex-M4 running the FALCON NIST’s reference software show that approximately 10k measurements are sufficient to extract the entire key.}, journal={2021 58TH ACM/IEEE DESIGN AUTOMATION CONFERENCE (DAC)}, author={Karabulut, Emre and Aysu, Aydin}, year={2021}, pages={691–696} } @article{karabulut_alkim_aysu_2021, title={Single-Trace Side-Channel Attacks on omega-Small Polynomial Sampling}, DOI={10.1109/HOST49136.2021.9702284}, abstractNote={This paper proposes a new single-trace side-channel attack on lattice-based post-quantum protocols. We target the ω-small polynomial sampling of NTRU, NTRU Prime, and CRYSTALS-DILITHIUM algorithm implementations (which are NIST Round-3 finalist and alternative candidates), and we demonstrate the vulnerabilities of their sub-routines to a power-based side-channel attack. Specifically, we reveal that the sorting implementation in NTRU/NTRU Prime and the shuffling in CRYSTALS-DILITHIUM's ω-small polynomial sampling process leaks information about the ‘-1’’0’, or ’+1' assignments made to the coefficients. We further demonstrate that these assignments can be found within a single power measurement and that revealing them allows secret and session key recovery for NTRU/NTRU Prime, while reducing the challenge polynomial's entropy for CRYSTALS-DILITHIUM. We execute our proposed attacks on an ARM Cortex-M4 microcontroller running the reference software submissions from NIST Round-3 software packages. The results show that our attacks can extract coefficients with a success rate of 99.78% for NTRU and NTRU Prime, reducing the search space to 241 or below. For CRYSTALS-DILITHIUM, our attack recovers the coefficients’ signs with over 99.99% success, reducing rejected challenge polynomials’ entropy between 39 to 60 bits. Our work informs the proposers about the single-trace vulnerabilities of their software and urges them to develop single-trace resilient software for low-cost microcontrollers.}, journal={2021 IEEE INTERNATIONAL SYMPOSIUM ON HARDWARE ORIENTED SECURITY AND TRUST (HOST)}, author={Karabulut, Emre and Alkim, Erdem and Aysu, Aydin}, year={2021}, pages={35–45} } @article{karabulut_aysu_2020, title={RANTT: A RISC-V Architecture Extension for the Number Theoretic Transform}, ISSN={["1946-1488"]}, DOI={10.1109/FPL50879.2020.00016}, abstractNote={Lattice-based cryptography has been growing in demand due to their quantum attack resiliency. Polynomial multiplication is a major computational bottleneck of lattice cryptosystems. To address the challenge, lattice-based cryptosystems use the Number Theoretic Transform (NTT). Although NTT reduces complexity, it is still a well-known computational bottleneck. At the same time, NTT arithmetic needs vary for different algorithms, motivating flexible solutions. Although there are prior hardware and software NTT designs, they do not simultaneously offer flexibility and efficiency. This work provides an efficient and flexible NTT solution through domain-specific architectural support on RISC-V. Rather than using instruction-set extensions with compiler modifications or loosely coupling a RISC-V core with an NTT co-processor, our proposal uses application-specific dynamic instruction scheduling, memory dependence prediction, and datapath optimizations. This allows achieving a direct translation of C code to optimized NTT executions. We demonstrate the flexibility of our approach by implementing the NTT used in several lattice-based cryptography protocols: NewHope, qTESLA, CRYSTALS-Kyber, CRYSTALS-Dilithium, and Falcon. The results on the FPGA technology show that the proposed design is respectively 6x, 40x, and 3x more efficient than the baseline solution, Berkeley Out-of-Order Machine, and a prior HW/SW co-design, while providing the needed flexibility.}, journal={2020 30TH INTERNATIONAL CONFERENCE ON FIELD-PROGRAMMABLE LOGIC AND APPLICATIONS (FPL)}, author={Karabulut, Emre and Aysu, Aydin}, year={2020}, pages={26–32} }