% NOTE(review): cleaned Zotero/Scholar auto-export. Entry types now match the
% venue (@inproceedings for conference papers; @misc + eprint fields for the
% arXiv preprint); month fields use the standard unquoted macros (oct, jun);
% page ranges use "--"; bracketed ISSN values unwrapped; proceedings names
% moved from journal= to booktitle=; redundant dx.doi.org URLs dropped in
% favour of bare doi= fields; abstractNote renamed to the conventional
% (style-ignored) abstract field; acronyms in titles brace-protected.
%
% NOTE(review): two citation keys contained spaces, which breaks \cite:
%   "karuppanan_mirbagher ajorpaz_2023"        -> karuppanan_mirbagher_ajorpaz_2023
%   "mirbagher ajorpaz_garza_jindal_jimenez_2018" -> mirbagher_ajorpaz_garza_jindal_jimenez_2018
% Update any \cite commands that reference the old keys.

@misc{karuppanan_mirbagher_ajorpaz_2023,
  title      = {An Attack on The Speculative Vectorization: Leakage from Higher Dimensional Speculation},
  author     = {Karuppanan, Sayinath and Mirbagher Ajorpaz, Samira},
  year       = {2023},
  eprint     = {2302.01131},
  eprinttype = {arXiv},
  publisher  = {arXiv},
  url        = {https://arxiv.org/abs/2302.01131},
  doi        = {10.48550/ARXIV.2302.01131},
  abstract   = {This paper argues and shows that speculative vectorization, where a loop with rare or unknown memory dependencies are still vectorized, is fundamentally vulnerable and cannot be mitigated by existing defenses. We implement a simple proof of concept and show the leakage in Apple M2 SoC. We describe the source of leakage using Microarchitectural Leakage Descriptors MLD and we additionally describe principles to extend MLD for other optimization. Also as part of implementation we reverse engineer the M2 cache size and use threaded timer to differentiate between cache hit and miss.},
}

@inproceedings{chacon_garza_jimborean_ros_gratz_jimenez_mirbagher-ajorpaz_2022,
  title     = {Composite Instruction Prefetching},
  author    = {Chacon, Gino and Garza, Elba and Jimborean, Alexandra and Ros, Alberto and Gratz, Paul V. and Jimenez, Daniel A. and Mirbagher-Ajorpaz, Samira},
  booktitle = {2022 IEEE 40th International Conference on Computer Design (ICCD)},
  publisher = {IEEE},
  year      = {2022},
  month     = oct,
  pages     = {471--478},
  issn      = {1063-6404},
  doi       = {10.1109/iccd56317.2022.00076},
  abstract  = {Prefetching is a pivotal mechanism for effectively masking latencies due to the processor/memory performance gap. Instruction prefetchers prevent costly instruction fetch stalls by requesting blocks of instruction memory in advance of their use to keep the pipeline front-end busy. The rapidly increasing instruction footprints of modern workloads have amplified the importance of such research. We propose a framework to leverage the complementary prefetching behaviors of existing prefetching techniques to create composite prefetchers. We show that recently proposed instruction prefetching techniques leverage different mechanisms from one another and find that in many cases, different prefetchers are complementary to each other. Composite prefetching allows for higher performance at lower storage overheads by combining the coverage of different complex prefetchers. We demonstrate a framework for selecting and combining state-of-the-art complex prefetchers, in a "plug-and-play" fashion, to identify the best performing combinations at various hardware overheads. We show that for every storage capacity constraint analyzed, composite prefetching outperforms prior prefetching schemes with greater improvements shown at smaller capacity constraints.},
}

@inproceedings{testa_mirbagher-ajorpaz_jimenez_2022,
  title     = {Dynamic Set Stealing to Improve Cache Performance},
  author    = {Testa, Brady and Mirbagher-Ajorpaz, Samira and Jimenez, Daniel A.},
  booktitle = {2022 IEEE 34th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)},
  year      = {2022},
  pages     = {60--70},
  issn      = {1550-6533},
  doi       = {10.1109/SBAC-PAD55451.2022.00017},
  abstract  = {In the last-level cache (LLC), replacement policy is dependent on workload characteristics. Adapting the policy to the current workload has been an active area of research. Previous works includes set dueling exemplified by DIP [40] which uses static replacement policies and machine learning based models such as Glider [45] or Multiperspective Reuse Prediction [19]. Both provide improvement over the least-recently-used (LRU) policy, but additional improvement is possible. DIP suffers from wasted resources as leading sets of the competing policies are fixed in size. Machine learning approaches each use a fixed set of features that were selected offline that are not optimal for all workloads. We introduce Set Stealing with Perceptron Tables (SSPT), a novel resource management policy that allows combining the strengths of many replacement policies to maximize performance and eliminate wasted processor resources. This policy achieves a 9.45% geometric mean speedup over a baseline LRU policy on a set of 81 SPEC benchmark workloads, compared to Glider's 9.28% and Multiperspective Reuse Prediction at 7.62% assuming a 2 MB LLC. We achieve a 9.70% geometric mean speedup over LRU on a set of 90 big data (GAPS and XS) workloads, compared to Glider's 8.79% and Multiperspective Reuse Prediction at 7.86% assuming a 2 MB LLC.},
}

@inproceedings{ajorpaz_moghimi_collins_pokam_abu-ghazaleh_tullsen_2022,
  title     = {{EVAX}: Towards a Practical, Pro-active \& Adaptive Architecture for High Performance \& Security},
  author    = {Ajorpaz, Samira Mirbagher and Moghimi, Daniel and Collins, Jeffrey Neal and Pokam, Gilles and Abu-Ghazaleh, Nael and Tullsen, Dean},
  booktitle = {2022 55th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  year      = {2022},
  pages     = {1218--1236},
  issn      = {1072-4451},
  doi       = {10.1109/MICRO56248.2022.00085},
  abstract  = {This paper provides an end-to-end solution to defend against known microarchitectural attacks such as speculative execution attacks, fault-injection attacks, covert and side channel attacks, and unknown or evasive versions of these attacks. Current defenses are attack specific and can have unacceptably high performance overhead. We propose an approach that reduces the overhead of state-of-art defenses by over 95%, by applying defenses only when attacks are detected. Many current proposed mitigations are not practical for deployment; for example, InvisiSpec has 27% overhead and Fencing has 74% overhead while protecting against only Spectre attacks. Other mitigations carry similar performance penalties. We reduce the overhead for InvisiSpec to 1.26% and for Fencing to 3.45% offering performance and security for not only Spectre attacks but other known transient attacks as well, including the dangerous class of LVI and Rowhammer attacks, as well as covering a large set of future evasive and zero-day attacks. Critical to our approach is an accurate detector that is not fooled by evasive attacks and that can generalize to novel zero-day attacks. We use a novel Generative framework, Evasion Vaccination (EVAX) for training ML models and engineering new security-centric performance counters. EVAX significantly increases sensitivity to detect and classify attacks in time for mitigation to be deployed with low false positives (4 FPs in every 1M instructions in our experiments). Such performance enables efficient and timely mitigations, enabling the processor to automatically switch between performance and security as needed.},
}

@inproceedings{mirbagher-ajorpaz_garza_pokam_jimenez_2020,
  title     = {{CHiRP}: Control-Flow History Reuse Prediction},
  author    = {Mirbagher-Ajorpaz, Samira and Garza, Elba and Pokam, Gilles and Jimenez, Daniel A.},
  booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  publisher = {IEEE},
  year      = {2020},
  month     = oct,
  doi       = {10.1109/micro50266.2020.00023},
  abstract  = {Translation Lookaside Buffers (TLBs) play a critical role in hardware-supported memory virtualization. To speed up address translation and reduce costly page table walks, TLBs cache a small number of recently-used virtual-to-physical address translations. TLBs must make the best use of their limited capacities. Thus, TLB entries with low potential for reuse should be replaced by more useful entries. This paper contributes to an aspect of TLB management that has received little attention in the literature: replacement policy. We show how predictive replacement policies can be tailored toward TLBs to reduce miss rates and improve overall performance. We begin by applying recently proposed predictive cache replacement policies to the TLB. We show these policies do not work well without considering specific TLB behavior. Next, we introduce a novel TLB-focused predictive policy, Control-flow History Reuse Prediction (CHIRP). This policy uses a history signature and replacement algorithm that correlates to known TLB behavior, outperforming other policies. For a 1024-entry 8-way set-associative L2 TLB with a 4KB page size, we show that CHiRP reduces misses per 1000 instructions (MPKI) by an average 28.21% over the least-recently-used (LRU) policy, outperforming Static Re-reference Interval Prediction (SRRIP) [1], Global History Reuse Policy (GHRP) [2] and SHiP [3], which reduce MPKI by an average of 10.36%, 9.03% and 0.88%, respectively.},
}

@inproceedings{mirbagher-ajorpaz_pokam_mohammadian-koruyeh_garza_abu-ghazaleh_jimenez_2020,
  title     = {{PerSpectron}: Detecting Invariant Footprints of Microarchitectural Attacks with Perceptron},
  author    = {Mirbagher-Ajorpaz, Samira and Pokam, Gilles and Mohammadian-Koruyeh, Esmaeil and Garza, Elba and Abu-Ghazaleh, Nael and Jimenez, Daniel A.},
  booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  publisher = {IEEE},
  year      = {2020},
  month     = oct,
  doi       = {10.1109/micro50266.2020.00093},
  abstract  = {Detecting microarchitectural attacks is critical given their proliferation in recent years. Many of these attacks exhibit intrinsic behaviors essential to the nature of their operation, such as creating contention or misspeculation. This study systematically investigates the microarchitectural footprints of hardware-based attacks and shows how they can be detected and classified using an efficient hardware predictor. We present a methodology to use correlated microarchitectural statistics to design a hardware-based neural predictor capable of detecting and classifying microarchitectural attacks before data is leaked. Once a potential attack is detected, it can be proactively mitigated by triggering appropriate countermeasures. Our hardware-based detector, PerSpectron, uses perceptron learning to identify and classify attacks. Perceptron-based prediction has been successfully used in branch prediction and other hardware-based applications. PerSpectron has minimal performance overhead. The statistics being monitored have similar overhead to already existing performance monitoring counters. Additionally, PerSpectron operates outside the processor's critical paths, offering security without added computation delay. Our system achieves a usable detection rate for detecting attacks such as SpectreV1, SpectreV2, SpectreRSB, Meltdown, breakingKSLR, Flush+Flush, Flush+Reload, Prime+Probe as well as cache-attack calibration programs. We also believe that the large number of diverse microarchitectural features offers both evasion resilience and interpretability---features not present in previous hardware security detectors. We detect these attacks early enough to avoid any data leakage, unlike previous work that triggers countermeasures only after data has been exposed.},
}

@inproceedings{garza_mirbagher-ajorpaz_khan_jimenez_2019,
  title     = {Bit-level perceptron prediction for indirect branches},
  author    = {Garza, Elba and Mirbagher-Ajorpaz, Samira and Khan, Tahsin Ahmad and Jim{\'e}nez, Daniel A.},
  booktitle = {Proceedings of the 46th International Symposium on Computer Architecture},
  publisher = {ACM},
  year      = {2019},
  month     = jun,
  doi       = {10.1145/3307650.3322217},
  abstract  = {Modern software uses indirect branches for various purposes including, but not limited to, virtual method dispatch and implementation of switch statements. Because an indirect branch's target address cannot be determined prior to execution, high-performance processors depend on highly-accurate indirect branch prediction techniques to mitigate control hazards. This paper proposes a new indirect branch prediction scheme that predicts target addresses at the bit level. Using a series of perceptron-based predictors, our predictor predicts individual branch target address bits based on correlations within branch history. Our evaluations show this new branch target predictor is competitive with state-of-the-art branch target predictors at an equivalent hardware budget. For instance, over a set of workloads including SPEC and mobile applications, our predictor achieves a misprediction rate of 0.183 mispredictions per 1000 instructions, compared with 0.193 for the state-of-the-art ITTAGE predictor and 0.29 for a VPC-based indirect predictor.},
}

@inproceedings{mirbagher_ajorpaz_garza_jindal_jimenez_2018,
  title     = {Exploring Predictive Replacement Policies for Instruction Cache and Branch Target Buffer},
  author    = {Mirbagher Ajorpaz, Samira and Garza, Elba and Jindal, Sangam and Jimenez, Daniel A.},
  booktitle = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
  publisher = {IEEE},
  year      = {2018},
  month     = jun,
  doi       = {10.1109/isca.2018.00050},
  abstract  = {Modern processors support instruction fetch with the instruction cache (I-cache) and branch target buffer (BTB). Due to timing and area constraints, the I-cache and BTB must efficiently make use of their limited capacities. Blocks in the I-cache or entries in the BTB that have low potential for reuse should be replaced by more useful blocks/entries. This work explores predictive replacement policies based on reuse prediction that can be applied to both the I-cache and BTB. Using a large suite of recently released industrial traces, we show that predictive replacement policies can reduce misses in the I-cache and BTB. We introduce Global History Reuse Prediction (GHRP), a replacement technique that uses the history of past instruction addresses and their reuse behaviors to predict dead blocks in the I-cache and dead entries in the BTB. This paper describes the effectiveness of GHRP as a dead block replacement and bypass optimization for both the I-cache and BTB. For a 64KB set-associative I-cache with a 64B block size, GHRP lowers the I-cache misses per 1000 instructions (MPKI) by an average of 18% over the least-recently-used (LRU) policy on a set of 662 industrial workloads, performing significantly better than Static Re-reference Interval Prediction (SRRIP) and Sampling Dead Block Prediction (SDBP). For a 4K-entry BTB, GHRP lowers MPKI by an average of 30% over LRU, 23% over SRRIP, and 29% over SDBP.},
}