@article{yudha_meyer_yuan_zhou_solihin_2022, title={LITE: A Low-Cost Practical Inter-Operable GPU TEE}, DOI={10.1145/3524059.3532361}, abstractNote={There is a strong need for GPU trusted execution environments (TEEs) as GPUs are increasingly used in the cloud environment. However, current proposals either ignore memory security (i.e., not encrypting memory) or impose a separate memory encryption domain from the host TEE, causing a very substantial slowdown for communicating data from/to the host. In this paper, we propose a flexible GPU memory encryption design called LITE that relies on software memory encryption aided by small architecture support. LITE's flexibility allows the GPU TEE to be co-designed with the CPU to create a unified encryption domain. We show that GPU applications can be adapted to the use of LITE encryption APIs without major changes. Through various optimizations, we show that software memory encryption in LITE can produce negligible performance overheads (1.1%) for regular benchmarks and still-acceptable overheads (56%) for irregular benchmarks.}, journal={PROCEEDINGS OF THE 36TH ACM INTERNATIONAL CONFERENCE ON SUPERCOMPUTING, ICS 2022}, author={Yudha, Ardhi Wiratama Baskara and Meyer, Jake and Yuan, Shougang and Zhou, Huiyang and Solihin, Yan}, year={2022} } @article{ul mustafa_xu_shen_solihin_2021, title={Seeds of SEED: New Security Challenges for Persistent Memory}, DOI={10.1109/SEED51797.2021.00020}, abstractNote={The Persistent Memory Object (PMO) is a general system abstraction for holding persistent data in persistent main memory, managed by an operating system. The PMO programming model breaks inter-process isolation as it results in sharing of persistent data between two processes as they alternately access the same PMO. The uncoordinated data access opens a new avenue for cross-run and cross-process security attacks. In this paper, we discuss threat vulnerabilities that are either new or increased in intensity under the PMO programming model. We also discuss security implications of using the PMO, highlighting sample PMO-based attacks and potential strategies to defend against them.}, journal={2021 INTERNATIONAL SYMPOSIUM ON SECURE AND PRIVATE EXECUTION ENVIRONMENT DESIGN (SEED 2021)}, author={Ul Mustafa, Naveed and Xu, Yuanchao and Shen, Xipeng and Solihin, Yan}, year={2021}, pages={83–88} } @article{xu_ye_solihin_shen_2020, title={Hardware-Based Domain Virtualization for Intra-Process Isolation of Persistent Memory Objects}, ISSN={["0884-7495"]}, DOI={10.1109/ISCA45697.2020.00062}, abstractNote={Persistent memory has appealing properties in serving as main memory. While file access is protected by system calls, an attached persistent memory object (PMO) is one load/store away from accidental (or malicious) reads or writes, which may arise from use of just one buggy library. The recent progress in intra-process isolation could potentially protect PMOs by enabling a process to partition sensitive data and code into isolated components. However, the existing intra-process isolations (e.g., Intel MPK) support isolation of only up to 16 domains, forming a major barrier for PMO protections. Although there is some recent effort trying to virtualize MPK to circumvent the limit, it suffers from large overhead.
This paper presents two novel architecture supports, which provide 11-52× higher efficiency while offering the first known domain-based protection for PMOs.}, journal={2020 ACM/IEEE 47TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2020)}, author={Xu, Yuanchao and Ye, ChenCheng and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={680–692} } @article{elkhouly_alshboul_hayashi_solihin_kimura_2019, title={Compiler-support for Critical Data Persistence in NVM}, volume={16}, ISSN={["1544-3973"]}, DOI={10.1145/3371236}, abstractNote={Non-volatile Main Memories (NVMs) offer a promising way to preserve data persistence and enable computation recovery in case of failure. While the use of NVMs can significantly reduce the overhead of failure recovery, which is the case with High-Performance Computing (HPC) kernels, rewriting existing programs or writing new applications for NVMs is non-trivial. In this article, we present compiler support that automatically inserts complex instructions into kernels to achieve NVM data persistence based on a simple programmer directive. Unlike checkpointing techniques that store the whole system state, our technique only persists user-designated objects as well as some parameters required for safe recovery, such as loop induction variables. Also, our technique can reduce the number of data transfer operations, because our compiler coalesces consecutive memory-persisting operations into a single memory transaction per cache line when possible. Our compiler support is implemented in the LLVM toolchain and introduces the necessary modifications to loop-intensive computational kernels (e.g., TMM, LU, Gauss, and FFT) to force data persistence. The experiments show that our proposed compiler support outperforms the most recent checkpointing techniques while its performance overheads are insignificant.}, number={4}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Elkhouly, Reem and Alshboul, Mohammad and Hayashi, Akihiro and Solihin, Yan and Kimura, Keiji}, year={2019}, month={Dec} } @article{alshboul_elnawawy_elkhouly_kimura_tuck_solihin_2019, title={Efficient Checkpointing with Recompute Scheme for Non-volatile Main Memory}, volume={16}, ISSN={["1544-3973"]}, DOI={10.1145/3323091}, abstractNote={Future main memory will likely include Non-Volatile Memory. Non-Volatile Main Memory (NVMM) provides an opportunity to rethink checkpointing strategies for providing failure safety to applications. While there are many checkpointing and logging schemes in the literature, their use must be revisited as they incur high execution time overheads as well as a large number of additional writes to NVMM, which may significantly impact write endurance. In this article, we propose a novel recompute-based failure safety approach and demonstrate its applicability to loop-based code. Rather than keeping a fully consistent logging state, we only log enough state to enable recomputation. Upon a failure, our approach recovers to a consistent state by determining which parts of the computation were not completed and recomputing them. Effectively, our approach removes the need to keep checkpoints or logs, thus reducing execution time overheads and improving NVMM write endurance at the expense of more complex recovery. We compare our new approach against logging and checkpointing on five scientific workloads, including tiled matrix multiplication, on a computer system model that was built on gem5 and supports Intel PMEM instruction extensions.
For tiled matrix multiplication, our recompute approach incurs an execution time overhead of only 5%, in contrast to 8% overhead with logging and 207% overhead with checkpointing. Furthermore, recompute only adds 7% additional NVMM writes, compared to 111% with logging and 330% with checkpointing. We also conduct experiments on real hardware, allowing us to run our workloads to completion while varying the number of threads used for computation. These experiments substantiate our simulation-based observations and provide a sensitivity study and performance comparison between the Recompute Scheme and Naive Checkpointing.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Alshboul, Mohammad and Elnawawy, Hussein and Elkhouly, Reem and Kimura, Keiji and Tuck, James and Solihin, Yan}, year={2019}, month={May} } @inproceedings{lin_alshboul_solihin_zhou_2019, title={Exploring Memory Persistency Models for GPUs}, ISSN={["1089-795X"]}, DOI={10.1109/PACT.2019.00032}, abstractNote={Given its high integration density, high speed, byte addressability, and low standby power, non-volatile or persistent memory is expected to supplement/replace DRAM as main memory. Through a persistency programming model (which defines durability ordering of stores) and durable transaction constructs, the programmer can provide recoverable data structures (RDS) that allow programs to recover to a consistent state after a failure. While persistency models have been well studied for CPUs, they have been neglected for graphics processing units (GPUs). Considering the importance of GPUs as a dominant accelerator for high performance computing, we investigate persistency models for GPUs. GPU applications exhibit substantial differences from CPU applications; hence, in this paper we adapt, re-architect, and optimize CPU persistency models for GPUs. We design a pragma-based compiler scheme for expressing persistency models for GPUs. We identify that the thread hierarchy in GPUs offers intuitive scopes to form epochs and durable transactions. We find that undo logging produces significant performance overheads. We propose to use idempotency analysis to reduce both logging frequency and the size of logs. Through both real-system and simulation evaluations, we show low overheads of our proposed architecture support.}, booktitle={28th International Conference on Parallel Architectures and Compilation Techniques (PACT)}, author={Lin, Zhen and Alshboul, Mohammad and Solihin, Yan and Zhou, Huiyang}, year={2019}, pages={310–322} } @article{alshboul_tuck_solihin_2018, title={Lazy Persistency: a High-Performing and Write-Efficient Software Persistency Technique}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA.2018.00044}, abstractNote={Emerging Non-Volatile Memories (NVMs) are expected to be included in future main memory, providing the opportunity to host important data persistently in main memory. However, achieving persistency requires that programs be written with failure-safety in mind. Many persistency models and techniques have been proposed to help the programmer reason about failure-safety. They require that the programmer eagerly flush data out of caches to make it persistent. Eager persistency comes with a large overhead because it adds many instructions to the program for flushing cache lines and incurs costly stalls at barriers to wait for data to become durable.
To reduce these overheads, we propose Lazy Persistency (LP), a software persistency technique that allows caches to slowly send dirty blocks to the NVMM through natural evictions. With LP, there are no additional writes to NVMM, no decrease in write endurance, and no performance degradation from cache line flushes and barriers. Persistency failures are discovered using software error detection (checksum), and the system recovers from them by recomputing inconsistent results. We describe the properties and design of LP and demonstrate how it can be applied to loop-based kernels popularly used in scientific computing. We evaluate LP and compare it to the state-of-the-art Eager Persistency technique from prior work. Compared to it, LP reduces the execution time and write amplification overheads from 9% and 21% to only 1% and 3%, respectively.}, journal={2018 ACM/IEEE 45TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA)}, author={Alshboul, Mohammad and Tuck, James and Solihin, Yan}, year={2018}, pages={439–451} } @article{shin_cox_oskin_loh_solihin_bhattacharjee_basu_2018, title={Scheduling Page Table Walks for Irregular GPU Applications}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA.2018.00025}, abstractNote={Recent studies on commercial hardware demonstrated that irregular GPU applications can bottleneck on virtual-to-physical address translations. In this work, we explore ways to reduce address translation overheads for such applications. We discover that the order of servicing the GPU's address translation requests (specifically, page table walks) plays a key role in determining the amount of translation overhead experienced by an application. We find that different SIMD instructions executed by an application require vastly different amounts of work to service their address translation needs, primarily depending upon the number of distinct pages they access. We show that better forward progress is achieved by prioritizing translation requests from the instructions that require less work to service their address translation needs. Further, in the GPU's Single-Instruction-Multiple-Thread (SIMT) execution paradigm, all threads that execute in lockstep (wavefront) need to finish operating on their respective data elements (and thus, finish their address translations) before the execution moves ahead. Thus, batching walk requests originating from the same SIMD instruction could reduce unnecessary stalls. We demonstrate that the reordering of translation requests based on the above principles improves the performance of several irregular GPU applications by 30% on average.}, journal={2018 ACM/IEEE 45TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA)}, author={Shin, Seunghee and Cox, Guilherme and Oskin, Mark and Loh, Gabriel H. and Solihin, Yan and Bhattacharjee, Abhishek and Basu, Arkaprava}, year={2018}, pages={180–192} } @inproceedings{wang_awad_solihin_2017, title={Clone morphing: Creating new workload behavior from existing applications}, DOI={10.1109/ispass.2017.7975274}, abstractNote={Computer system designers need a deep understanding of end users' workloads in order to arrive at an optimum design. However, current design practices suffer from two problems: time mismatch, where designers rely on workloads available today to design systems that will be produced years into the future to run future workloads, and sparse behavior, where much performance behavior is not represented by the limited set of applications available today.
We propose clone morphing, a systematic method for producing new synthetic workloads (morphs) with performance behavior that does not currently exist. The morphs are generated automatically without knowing or changing the original application's source code. There are three different aspects in which a morph can differ from the original benchmark it is built on: temporal locality, spatial locality, and memory footprint. We showed how each of these aspects can be varied largely independently of the other aspects. Furthermore, we also presented a method for merging two different applications into one that has an average behavior of both applications. We evaluated the morphs by running them on simulators and collecting statistics that capture their behavior, and validated that morphs can be used for projecting future workloads and for generating new behavior that fills up the behavior map densely.}, booktitle={Ieee international symposium on performance analysis of systems and}, author={Wang, Y. P. and Awad, A. and Solihin, Y.}, year={2017}, pages={97–107} } @article{shin_tuck_solihin_2017, title={Hiding the Long Latency of Persist Barriers Using Speculative Execution}, DOI={10.1145/3079856.3080240}, abstractNote={Byte-addressable non-volatile memory technology is emerging as an alternative for DRAM for main memory. This new Non-Volatile Main Memory (NVMM) allows programmers to store important data in data structures in memory instead of serializing it to the file system, thereby providing a substantial performance boost. However, modern systems reorder memory operations and utilize volatile caches for better performance, making it difficult to ensure a consistent state in NVMM. Intel recently announced a new set of persistence instructions, clflushopt, clwb, and pcommit. These new instructions make it possible to implement fail-safe code on NVMM, but few workloads have been written or characterized using these new instructions. In this work, we describe how these instructions work and how they can be used to implement write-ahead logging based transactions. We implement several common data structures and kernels and evaluate the performance overhead incurred over traditional non-persistent implementations. In particular, we find that persistence instructions occur in clusters along with expensive fence operations, they have long latency, and they add a significant execution time overhead, on average by 20.3% over code with logging but without fence instructions to order persists. To deal with this overhead and alleviate the performance bottleneck, we propose to speculate past long latency persistency operations using checkpoint-based processing. Our speculative persistence architecture reduces the execution time overheads to only 3.6%.}, journal={44TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2017)}, author={Shin, Seunghee and Tuck, James and Solihin, Yan}, year={2017}, pages={175–186} } @article{awad_wang_shands_solihin_2017, title={ObfusMem: A Low-Overhead Access Obfuscation for Trusted Memories}, DOI={10.1145/3079856.3080230}, abstractNote={Trustworthy software requires strong privacy and security guarantees from a secure trust base in hardware. While chipmakers provide hardware support for basic security and privacy primitives such as enclaves and memory encryption, these primitives do not address hiding of the memory access pattern, information about which may enable attacks on the system or reveal characteristics of sensitive user data.
State-of-the-art approaches to protecting the access pattern are largely based on Oblivious RAM (ORAM). Unfortunately, current ORAM implementations suffer from very significant practicality and overhead concerns, including roughly an order of magnitude slowdown, more than 100% memory capacity overheads, and the potential for system deadlock. Memory technology trends are moving towards 3D and 2.5D integration, enabling significant logic capabilities and sophisticated memory interfaces. Leveraging these trends, we propose a new approach to access pattern obfuscation, called ObfusMem. ObfusMem adds the memory to the trusted computing base and incorporates cryptographic engines within the memory. ObfusMem encrypts commands and addresses on the memory bus, hence the access pattern is cryptographically obfuscated from external observers. Our evaluation shows that ObfusMem incurs an overhead of 10.9% on average, which is about an order of magnitude faster than ORAM implementations. Furthermore, ObfusMem does not incur capacity overheads and does not amplify writes. We analyze and compare the security protections provided by ObfusMem and ORAM, and highlight their differences.}, journal={44TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2017)}, author={Awad, Amro and Wang, Yipeng and Shands, Deborah and Solihin, Yan}, year={2017}, pages={107–119} } @article{shin_kim_solihin_2016, title={Dense Footprint Cache: Capacity-Efficient Die-Stacked DRAM Last Level Cache}, DOI={10.1145/2989081.2989096}, abstractNote={Die-stacked DRAM technology enables a large Last Level Cache (LLC) that provides high bandwidth data access to the processor. However, it requires a large tag array that may take a significant portion of the on-chip SRAM budget. To reduce this SRAM overhead, systems like Intel Haswell rely on a large block (Mblock) size. One drawback of a large Mblock size is that many bytes of an Mblock are not needed by the processor but are fetched into the cache. A recent technique (Footprint cache) to solve this problem works by dividing the Mblock into smaller blocks where only blocks predicted to be needed by the processor are brought into the LLC. While it helps to alleviate the excessive bandwidth consumption from fetching unneeded blocks, the capacity waste remains: only blocks that are predicted useful are fetched and allocated, and the remaining area of the Mblock is left empty, creating holes. Unfortunately, holes create significant capacity overheads, wasting space that could have held useful data as well as refresh power on useless data. In this paper, we propose a new design, Dense Footprint Cache (DFC). Similar to Footprint cache, DFC uses a large Mblock and relies on useful block prediction in order to reduce memory bandwidth consumption. However, when blocks of an Mblock are fetched, the blocks are placed contiguously in the cache, thereby eliminating holes, increasing capacity and power efficiency, and increasing performance. Mblocks in DFC have variable sizes and a cache set has a variable associativity, hence it presents new challenges in designing its management policies (placement, replacement, and update).
Through simulation of Big Data applications, we show that DFC reduces LLC miss ratios by about 43%, speeds up applications by 9.5%, and consumes 4.3% less energy on average.}, journal={MEMSYS 2016: PROCEEDINGS OF THE INTERNATIONAL SYMPOSIUM ON MEMORY SYSTEMS}, author={Shin, Seunghee and Kim, Sihong and Solihin, Yan}, year={2016}, pages={191–203} } @article{li_li_lin_li_2017, title={Significant and sustaining elevation of blood oxygen induced by Chinese cupping therapy as assessed by near-infrared spectroscopy}, volume={8}, ISSN={["2156-7085"]}, DOI={10.1364/boe.8.000223}, abstractNote={Cupping therapy has been used in traditional Chinese medicine for thousands of years to relieve muscle pain/tendency/fatigue and to cure or reduce symptoms of other diseases. However, its therapeutic effect is sparsely interpreted in the language of modern physiology. To objectively evaluate its therapeutic effect, we focused on dry cupping treatment and utilized near-infrared spectroscopy (NIRS) to assess the concentration change in oxy-hemoglobin ([HbO2]), deoxy-hemoglobin ([Hb]), and blood volume in the course of cupping therapy over 13 volunteers on the infraspinatus muscle, where cupping is usually applied for shoulder pain. Both a prominent drop in [Hb] and a significant elevation in [HbO2] in the tissue surrounding the cupping site were observed during both cupping and post-treatment, manifesting the enhancement of oxygen uptake. This resulting promotion indicates a potential positive therapeutic effect of cupping therapy in hemodynamics for facilitating muscular functions.}, number={1}, journal={BIOMEDICAL OPTICS EXPRESS}, author={Li, Ting and Li, Yaoxian and Lin, Yu and Li, Kai}, year={2017}, month={Jan}, pages={223–229} } @article{awad_manadhata_haber_solihin_horne_2016, title={Silent Shredder: Zero-Cost Shredding for Secure Non-Volatile Main Memory Controllers}, volume={51}, ISSN={["1558-1160"]}, DOI={10.1145/2954679.2872377}, abstractNote={As non-volatile memory (NVM) technologies are expected to replace DRAM in the near future, new challenges have emerged. For example, NVMs have slow and power-consuming writes, and limited write endurance. In addition, NVMs have a data remanence vulnerability, i.e., they retain data for a long time after being powered off. NVM encryption alleviates the vulnerability, but exacerbates the limited endurance by increasing the number of writes to memory. We observe that, in current systems, a large percentage of main memory writes result from data shredding in operating systems, a process of zeroing out physical pages before mapping them to new processes, in order to protect previous processes' data. In this paper, we propose Silent Shredder, which repurposes initialization vectors used in standard counter mode encryption to completely eliminate the data shredding writes. Silent Shredder also speeds up reading shredded cache lines, and hence reduces power consumption and improves overall performance. To evaluate our design, we run three PowerGraph applications and 26 multi-programmed workloads from the SPEC 2006 suite, on a gem5-based full system simulator. Silent Shredder eliminates an average of 48.6% of the writes in the initialization and graph construction phases. It speeds up main memory reads by 3.3 times, and improves the number of instructions per cycle (IPC) by 6.4% on average.
Finally, we discuss several use cases, including virtual machines' data isolation and user-level large data initialization, where Silent Shredder can be used effectively at no extra cost.}, number={4}, journal={ACM SIGPLAN NOTICES}, author={Awad, Amro and Manadhata, Pratyusa and Haber, Stuart and Solihin, Yan and Horne, William}, year={2016}, month={Apr}, pages={263–276} } @inproceedings{wang_solihin_2015, title={Emulating cache organizations on real hardware using performance cloning}, DOI={10.1109/ispass.2015.7095815}, abstractNote={Computer system designers need a deep understanding of end users' workload in order to arrive at an optimum design. Unfortunately, many end users will not share their software to designers due to the proprietary or confidential nature of their software. Researchers have proposed workload cloning, which is a process of extracting statistics that summarize the behavior of users' workloads through profiling, followed by using them to drive the generation of a representative synthetic workload (clone). Clones can be used in place of the original workloads to evaluate computer system performance, helping designers to understand the behavior of users workload on the simulated machine models without the users having to disclose proprietary or sensitive information about the original workload. In this paper, we propose infusing environment-specific information into the clone. This Environment-Specific Clone (ESC) enables the simulation of hypothetical cache configurations directly on a machine with a different cache configuration.
We validate ESC on both real systems and cache simulations. Furthermore, we present a case study of how page mapping affects cache performance. ESC enables such a study at native machine speed by infusing the page mapping information into clones, without needing to modify the OS or hardware. We then analyze the factors that determine how page mapping impacts cache performance, and how various applications are affected differently.}, booktitle={Ieee international symposium on performance analysis of systems and}, author={Wang, Y. P. and Solihin, Y.}, year={2015}, pages={298–307} } @article{wang_solihin_balakrishnan_2015, title={MeToo: Stochastic Modeling of Memory Traffic Timing Behavior}, ISSN={["1089-795X"]}, DOI={10.1109/pact.2015.36}, abstractNote={The memory subsystem (memory controller, bus, and DRAM) is becoming a bottleneck in computer system performance. Optimizing the design of the multicore memory subsystem requires good understanding of the representative workload. A common practice in designing the memory subsystem is to rely on trace simulation. However, the conventional method of relying on traditional traces faces two major challenges. First, many software users are apprehensive about sharing their code (source or binaries) due to the proprietary nature of the code or secrecy of data, so representative traces are sometimes not available. Second, there is a feedback loop where memory performance affects processor performance, which in turn alters the timing of memory requests that reach the bus. Such a feedback loop is difficult to capture with traces. In this paper, we present MeToo, a framework for generating synthetic memory traffic for memory subsystem design exploration. MeToo uses a small set of statistics that summarizes the performance behavior of the original applications, and generates synthetic traces or executables stochastically, allowing applications to remain proprietary. MeToo uses novel methods for mimicking the memory feedback loop. We validate MeToo clones, and show very good fit with the original applications' behavior, with an average error of only 4.2%, which is a small fraction of the errors obtained using geometric inter-arrival (commonly used in queueing models) and uniform inter-arrival.}, journal={2015 INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURE AND COMPILATION (PACT)}, author={Wang, Yipeng and Solihin, Yan and Balakrishnan, Ganesh}, year={2015}, pages={457–467} } @inproceedings{awad_kettering_solihin_2015, title={Non-volatile memory host controller interface performance analysis in high-performance I/O systems}, DOI={10.1109/ispass.2015.7095793}, abstractNote={Emerging non-volatile memories (NVMs), such as Phase-Change Memory (PCM), Spin-Transfer Torque RAM (STT-RAM) and Memristor, are very promising candidates for replacing NAND-Flash Solid-State Drives (SSDs) and Hard Disk Drives (HDDs) for many reasons. First, their read/write latencies are orders of magnitude faster. Second, some emerging NVMs, such as memristors, are expected to have very high densities, which allow deploying a much higher capacity without requiring increased physical space. While the percentage of the time taken for data movement over low-speed buses, such as Peripheral Component Interconnect (PCI), is negligible for the overall read/write latency in HDDs, it could be dominant for emerging fast NVMs. Therefore, the trend has moved toward using very fast interconnect technologies, such as PCI Express (PCIe), which is hundreds of times faster than the traditional PCI.
Accordingly, new host controller interfaces are used to communicate with I/O devices to exploit the parallelism and low-latency features of emerging NVMs through high-speed interconnects. In this paper, we investigate the system performance bottlenecks and overhead of using the standard state-of-the-art Non-Volatile Memory Express (NVMe), or Non-Volatile Memory Host Controller Interface (NVMHCI) Specification [1], as a representative of NVM host controller interfaces.}, booktitle={Ieee international symposium on performance analysis of systems and}, author={Awad, A. and Kettering, B. and Solihin, Y.}, year={2015}, pages={145–154} } @inbook{samih_wang_maciocco_kharbutli_solihin_2014, title={Collaborative Memories in Clusters: Opportunities and Challenges}, ISBN={9783642542114 9783642542121}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-54212-1_2}, DOI={10.1007/978-3-642-54212-1_2}, abstractNote={Highly-integrated distributed systems such as Intel Micro Server and SeaMicro Server are increasingly becoming a popular server architecture. Designers of such systems face interesting memory hierarchy design challenges while attempting to reduce/eliminate the notorious disk storage swapping. Disk swapping activities slow down applications' execution drastically. Swapping to the free remote memory of nearby nodes through Memory Collaboration has demonstrated its cost-effectiveness compared to overprovisioning memory for peak load requirements. Recent studies propose several ways to access the under-utilized remote memory in static system configurations, without detailed exploration of dynamic memory collaboration. Dynamic collaboration is an important aspect given the run-time memory usage fluctuations in clustered systems. Furthermore, with the growing interest in memory collaboration, it is crucial to understand the existing performance bottlenecks, overheads, and potential optimizations. In this paper we address these two issues. First, we propose an Autonomous Collaborative Memory System (ACMS) that manages memory resources dynamically at run time, to optimize performance, and provide QoS measures for nodes engaging in the system. We implement a prototype realizing the proposed ACMS, experiment with a wide range of real-world applications, and show up to 3x performance speedup compared to a non-collaborative memory system, without perceivable performance impact on nodes that provide memory. Second, we analyze, in depth, the end-to-end memory collaboration overhead and bottlenecks. Based on this analysis, we provide insights on several corresponding optimizations to further improve the performance.}, booktitle={Transactions on Computational Science XXII}, publisher={Springer Berlin Heidelberg}, author={Samih, Ahmad and Wang, Ren and Maciocco, Christian and Kharbutli, Mazen and Solihin, Yan}, year={2014}, pages={17–41} } @inproceedings{awad_solihin_2014, title={STM: Cloning the spatial and temporal memory access behavior}, DOI={10.1109/hpca.2014.6835935}, abstractNote={Computer architects need a deep understanding of clients' workloads in order to design and tune the architecture. Unfortunately, many important clients will not share their software with computer architects due to the proprietary or confidential nature of their software. One technique to mitigate this problem is producing synthetic traces (clones) that replicate the behavior of the original workloads.
Unfortunately, today there is no universal cloning technique that can capture arbitrary memory access behavior of applications. Existing techniques capture only temporal, but not spatial, locality. In order to study memory hierarchy organization beyond caches, such as prefetchers and the translation lookaside buffer (TLB), capturing only temporal locality is insufficient. In this paper, we propose a new memory access behavior cloning technique that captures both temporal and spatial locality. We abbreviate our scheme as Spatio-Temporal Memory (STM) cloning. We propose a new profiling method and statistics that capture stride patterns and transition probabilities. We show how the new statistics enable accurate clone generation that allows clones to be used in place of the original benchmarks for studying the L1/L2/TLB miss rates as we vary the L1 cache, L1 prefetcher, L2 cache, TLB, and page size configurations.}, booktitle={International symposium on high-performance computer}, author={Awad, A. and Solihin, Y.}, year={2014}, pages={237–247} } @article{tiwari_solihin_2012, title={Modeling and Analyzing Key Performance Factors of Shared Memory MapReduce}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2012.119}, abstractNote={The MapReduce parallel programming model has seen wide adoption in data center applications. Recently, lightweight, fast, in-memory MapReduce runtime systems have been proposed for shared memory systems. However, what factors affect performance and what performance bottlenecks exist for a given program are not well understood. This paper builds an analytical model to capture key performance factors of shared memory MapReduce and investigates important performance trends and behavior. Our study discovers several important findings and implications for system designers, performance tuners, and programmers. Our model quantifies the relative contribution of different key performance factors for both map and reduce phases, and shows that the performance of MapReduce programs is highly input-content dependent. Our model reveals that performance is heavily affected by the order in which distinct keys are encountered during the Map phase, and the frequency of these distinct keys. Our model points out cases in which reduce phase time dominates the total execution time. We also show that data-structure and algorithm design choices affect map and reduce phases differently, sometimes affecting the map phase positively while affecting the reduce phase negatively. Finally, we propose an application classification framework that can be used to reason about performance bottlenecks for a given application.}, journal={2012 IEEE 26TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Tiwari, Devesh and Solihin, Yan}, year={2012}, pages={1306–1317} } @inproceedings{balakrishnan_solihin_2012, title={WEST: Cloning data cache behavior using stochastic traces}, DOI={10.1109/hpca.2012.6169042}, abstractNote={Cache designers need an in-depth understanding of end user workloads, but certain end users are apprehensive about sharing code or traces due to the proprietary or confidential nature of code and data. To bridge this gap, cache designers use a reduced representation of the code (a clone). A promising cloning approach is the black box approach, where workloads are profiled to obtain key statistics, and a clone is automatically generated. Despite its potential, currently there are no highly accurate black box cloning methods for replicating data cache behavior.
We propose Workload Emulation using Stochastic Traces (WEST), a highly accurate black box cloning technique for replicating data cache behavior of arbitrary programs. First, we analyze what profiling statistics are necessary and sufficient to capture a workload. Then, we generate a clone stochastically that produces statistics identical to the proprietary workload. WEST clones can be used in lieu of the workload for exploring cache sizes, associativities, write policies, replacement policies, cache hierarchies and co-scheduling, at a significantly reduced simulation time. We use a simple IPC model to control the rate of accesses to the cache hierarchy. We evaluated WEST using CPU2006 and BioBench suites over a wide cache design space for single core and dual core CMPs. The clones achieve an average error in miss ratio of only 0.4% across 1394 single core cache configurations. For co-scheduled mixes, WEST achieves an average error in miss ratio of only 3.1% for over 600 configurations.}, booktitle={International symposium on high-performance computer}, author={Balakrishnan, G. and Solihin, Y.}, year={2012}, pages={387–398} } @inproceedings{jiang_solihin_2011, title={Architectural framework for supporting operating system survivability}, DOI={10.1109/hpca.2011.5749751}, abstractNote={The ever increasing size and complexity of Operating System (OS) kernel code bring an inevitable increase in the number of security vulnerabilities that can be exploited by attackers. A successful security attack on the kernel has a profound impact that may affect all processes running on it. In this paper we propose an architectural framework that provides survivability to the OS kernel, i.e. able to keep normal system operation despite security faults. It consists of three components that work together: (1) security attack detection, (2) security fault isolation, and (3) a recovery mechanism that resumes normal system operation. Through simple but carefully-designed architecture support, we provide OS kernel survivability with low performance overheads (< 5% for kernel intensive benchmarks). When tested with real world security attacks, our survivability mechanism automatically prevents the security faults from corrupting the kernel state or affecting other processes, recovers the kernel state and resumes execution.}, booktitle={International symposium on high-performance computer}, author={Jiang, X. W. and Solihin, Y.}, year={2011}, pages={456–465} } @article{samih_solihin_krishna_2011, title={Evaluating Placement Policies for Managing Capacity Sharing in CMP Architectures with Private Caches}, volume={8}, ISSN={["1544-3973"]}, DOI={10.1145/2019608.2019614}, abstractNote={Chip Multiprocessors (CMP) with distributed L2 caches suffer from a cache fragmentation problem; some caches may be overutilized while others may be underutilized. To avoid such fragmentation, researchers have proposed capacity sharing mechanisms where applications that need additional cache space can place their victim blocks in remote caches. However, we found that only allowing victim blocks to be placed on remote caches tends to cause a high number of remote cache hits relative to local cache hits. In this article, we show that many of the remote cache hits can be converted into local cache hits if we allow newly fetched blocks to be selectively placed directly in a remote cache, rather than in the local cache. 
To demonstrate this, we use future trace information to estimate the near-upperbound performance that can be gained from combined placement and replacement decisions in capacity sharing. Motivated by encouraging experimental results, we design a simple, predictor-based, scheme called Adaptive Placement Policy (APP) that learns from past cache behavior to make a better decision on whether to place a newly fetched block in the local or remote cache. We found that across 50 multiprogrammed workload mixes running on a 4-core CMP, APP's capacity sharing mechanism increases aggregate performance by 29% on average. At the same time, APP outperforms the state-of-the-art capacity sharing mechanism that uses only replacement-based decisions by up to 18.2%, with a maximum degradation of only 0.5%, and an average improvement of 3%. }, number={3}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Samih, Ahmad and Solihin, Yan and Krishna, Anil}, year={2011}, month={Oct} } @inproceedings{lee_tiwari_yan_tuck_2011, title={HAQu: Hardware-accelerated queueing for fine-grained threading on a chip multiprocessor}, DOI={10.1109/hpca.2011.5749720}, abstractNote={Queues are commonly used in multithreaded programs for synchronization and communication. However, because software queues tend to be too expensive to support finegrained parallelism, hardware queues have been proposed to reduce overhead of communication between cores. Hardware queues require modifications to the processor core and need a custom interconnect. They also pose difficulties for the operating system because their state must be preserved across context switches. To solve these problems, we propose a hardware-accelerated queue, or HAQu. HAQu adds hardware to a CMP that accelerates operations on software queues. Our design implements fast queueing through an application's address space with operations that are compatible with a fully software queue. Our design provides accelerated and OS-transparent performance in three general ways: (1) it provides a single instruction for enqueueing and dequeueing which significantly reduces the overhead when used in fine-grained threading; (2) operations on the queue are designed to leverage low-level details of the coherence protocol; and (3) hardware ensures that the full state of the queue is stored in the application's address space, thereby ensuring virtualization. We have evaluated our design in the context of application domains: offloading fine-grained checks for improved software reliability, and automatic, fine-grained parallelization using decoupled software pipelining.}, booktitle={International symposium on high-performance computer}, author={Lee, S. and Tiwari, D. and Yan, S. H. and Tuck, J.}, year={2011}, pages={99–110} } @inbook{chhabra_solihin_lal_hoekstra_2010, title={An Analysis of Secure Processor Architectures}, ISBN={9783642113888 9783642113895}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-11389-5_6}, DOI={10.1007/978-3-642-11389-5_6}, abstractNote={Security continues to be an increasingly important concern in the design of modern systems. Many systems may have security requirements such as protecting the integrity and confidentiality of data and code stored in the system, ensuring integrity of computations, or preventing the execution of unauthorized code. 
Making security guarantees has become even harder with the emergence of hardware attacks where the attacker has physical access to the system and can bypass any software security mechanisms employed. To this end, researchers have proposed Secure Processor architectures that provide protection against hardware attacks using platform features. In this paper, we analyze three of the currently proposed secure uniprocessor designs in terms of their security, complexity of hardware required and performance overheads: eXecute Only Memory (XOM), Counter mode encryption and Merkle tree based authentication, and Address Independent Seed Encryption and Bonsai Merkle Tree based authentication. We then provide a discussion on the issues in securing multiprocessor systems and survey one design each for Shared Memory Multiprocessors and Distributed Shared Memory Multiprocessors. Finally, we discuss future directions in Secure Processor research which have largely been ignored forming the weakest link in the security afforded by the proposed schemes, namely, Secure booting and Secure configuration. We identify potential issues which can serve to form the foundation of further research in secure processors.}, booktitle={Transactions on Computational Science VII}, publisher={Springer Berlin Heidelberg}, author={Chhabra, Siddhartha and Solihin, Yan and Lal, Reshma and Hoekstra, Matthew}, year={2010}, pages={101–121} } @inproceedings{jiang_madan_zhao_upton_iyer_makineni_newell_solihin_balasubramonian_2010, title={CHOP: Adaptive filter-based DRAM caching for CMP server platforms}, DOI={10.1109/hpca.2010.5416642}, abstractNote={As manycore architectures enable a large number of cores on the die, a key challenge that emerges is the availability of memory bandwidth with conventional DRAM solutions. To address this challenge, integration of large DRAM caches that provide as much as 5× higher bandwidth and as low as 1/3rd of the latency (as compared to conventional DRAM) is very promising. However, organizing and implementing a large DRAM cache is challenging because of two primary tradeoffs: (a) DRAM caches at cache line granularity require too large an on-chip tag area that makes it undesirable and (b) DRAM caches with larger page granularity require too much bandwidth because the miss rate does not reduce enough to overcome the bandwidth increase. In this paper, we propose CHOP (Caching HOt Pages) in DRAM caches to address these challenges. We study several filter-based DRAM caching techniques: (a) a filter cache (CHOP-FC) that profiles pages and determines the hot subset of pages to allocate into the DRAM cache, (b) a memory-based filter cache (CHOP-MFC) that spills and fills filter state to improve the accuracy and reduce the size of the filter cache and (c) an adaptive DRAM caching technique (CHOP-AFC) to determine when the filter cache should be enabled and disabled for DRAM caching. We conduct detailed simulations with server workloads to show that our filter-based DRAM caching techniques achieve the following: (a) on average over 30% performance improvement over previous solutions, (b) several magnitudes lower area overhead in tag space required for cache-line based DRAM caches, (c) significantly lower memory bandwidth consumption as compared to page-granular DRAM caches.}, booktitle={International symposium on high-performance computer}, author={Jiang, X. W. and Madan, N. and Zhao, L. and Upton, M. and Iyer, R. and Makineni, S. and Newell, D. and Solihin, Y. 
and Balasubramonian, R.}, year={2010}, pages={233–244} } @article{jiang_madan_zhao_upton_iyer_makineni_newell_solihin_balasubramonian_2011, title={CHOP: INTEGRATING DRAM CACHES FOR CMP SERVER PLATFORMS}, volume={31}, ISSN={["1937-4143"]}, DOI={10.1109/mm.2010.100}, abstractNote={Integrating large DRAM caches is a promising way to address the memory bandwidth wall issue in the many-core era. However, organizing and implementing a large DRAM cache imposes a trade-off between tag space overhead and memory bandwidth consumption. CHOP (Caching Hot Pages) addresses this trade-off through three filter-based DRAM-caching techniques.}, number={1}, journal={IEEE MICRO}, author={Jiang, Xiaowei and Madan, Niti and Zhao, Li and Upton, Mike and Iyer, Ravi and Makineni, Srihari and Newell, Donald and Solihin, Yan and Balasubramonian, Rajeev}, year={2011}, pages={99–108} } @inbook{chhabra_solihin_2010, title={Green Secure Processors: Towards Power-Efficient Secure Processor Design}, ISBN={9783642174988 9783642174995}, ISSN={0302-9743 1866-4741}, url={http://dx.doi.org/10.1007/978-3-642-17499-5_13}, DOI={10.1007/978-3-642-17499-5_13}, abstractNote={With the increasing wealth of digital information stored on computer systems today, security issues have become increasingly important. In addition to attacks targeting the software stack of a system, hardware attacks have become equally likely. Researchers have proposed Secure Processor Architectures which utilize hardware mechanisms for memory encryption and integrity verification to protect the confidentiality and integrity of data and computation, even from sophisticated hardware attacks. While there have been many works addressing performance and other system level issues in secure processor design, power issues have largely been ignored. In this paper, we first analyze the sources of power (energy) increase in different secure processor architectures. We then present a power analysis of various secure processor architectures in terms of their increase in power consumption over a base system with no protection and then provide recommendations for designs that offer the best balance between performance and power without compromising security. We extend our study to the embedded domain as well. We also outline the design of a novel hybrid cryptographic engine that can be used to minimize the power consumption for a secure processor. We believe that if secure processors are to be adopted in future systems (general purpose or embedded), it is critically important that power issues are considered in addition to performance and other system level issues. To the best of our knowledge, this is the first work to examine the power implications of providing hardware mechanisms for security.}, booktitle={Transactions on Computational Science X}, publisher={Springer Berlin Heidelberg}, author={Chhabra, Siddhartha and Solihin, Yan}, year={2010}, pages={329–351} } @inproceedings{tiwari_tuck_solihin_2010, title={MMT: Exploiting Fine Grained Parallelism in Dynamic Memory Management}, DOI={10.1109/ipdps.2010.5470428}, abstractNote={Dynamic memory management is one of the most expensive but ubiquitous operations in many C/C++ applications. Additional features such as security checks, while desirable, further worsen memory management overheads. With the advent of multicore architectures, it is important to investigate how dynamic memory management overheads for sequential applications can be reduced.
In this paper, we propose a new approach for accelerating dynamic memory management on multicore architectures by offloading dynamic memory management functions to a separate thread that we refer to as the memory management thread (MMT). We show that an efficient MMT design can give significant performance improvement by extracting parallelism while being agnostic to the underlying memory management library algorithms and data structures. We also show how parallelism provided by MMT can be beneficial for high overhead memory management tasks, for example, security checks related to memory management. We evaluate MMT on heap allocation-intensive benchmarks running on an Intel Core 2 Quad platform for two widely-used memory allocators: Doug Lea's and PHKmalloc allocators. On average, MMT achieves a speedup ratio of 1.19× for both allocators, while both the application and memory management libraries are unmodified and are oblivious to the parallelization scheme. For PHKmalloc with security checks turned on, MMT reduces the security check overheads from 21% to only 1% on average.}, booktitle={International Parallel and Distributed Processing Symposium}, author={Tiwari, D. and Tuck, J. and Solihin, Y.}, year={2010} } @article{guo_solihin_zhao_iyer_2010, title={Quality of Service Shared Cache Management in Chip Multiprocessor Architecture}, volume={7}, ISSN={["1544-3973"]}, DOI={10.1145/1880037.1880039}, abstractNote={The trends in enterprise IT toward service-oriented computing, server consolidation, and virtual computing point to a future in which workloads are becoming increasingly diverse in terms of performance, reliability, and availability requirements. It can be expected that more and more applications with diverse requirements will run on a Chip Multi-Processor (CMP) and share platform resources such as the lowest level cache and off-chip bandwidth. In this environment, it is desirable to have microarchitecture and software support that can provide a guarantee of a certain level of performance, which we refer to as performance Quality of Service. In this article, we investigated what framework would be needed to manage the shared cache resource for fully providing QoS in a CMP. We found that, in order to fully provide QoS, we need to specify an appropriate QoS target for each job and apply an admission control policy to accept jobs only when their QoS targets can be satisfied. We also found that providing strict QoS often leads to a significant reduction in throughput due to resource fragmentation. We proposed throughput optimization techniques that include: (1) exploiting various QoS execution modes, and (2) a microarchitecture technique, which we refer to as resource stealing, that detects and reallocates excess cache capacity from a job while preserving its QoS target. We designed and evaluated three algorithms for performing resource stealing, which differ in how aggressive they are in stealing excess cache capacity, and in the degree of confidence in meeting QoS targets. In addition, we proposed a mechanism to dynamically enable or disable resource stealing depending on whether other jobs can benefit from additional cache capacity. We evaluated our QoS framework with a full system simulation of a 4-core CMP and a recent version of the Linux Operating System.
We found that compared to an unoptimized scheme, the throughput can be improved by up to 47%, making the throughput significantly closer to a non-QoS CMP.}, number={3}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Guo, Fei and Solihin, Yan and Zhao, Li and Iyer, Ravishankar}, year={2010}, month={Dec} } @inproceedings{liu_jiang_solihin_2010, title={Understanding how off-chip memory bandwidth partitioning in chip multiprocessors affects system performance}, DOI={10.1109/hpca.2010.5416655}, abstractNote={Chip Multi-Processor (CMP) architectures have recently become a mainstream computing platform. Recent CMPs allow cores to share expensive resources, such as the last level cache and off-chip pin bandwidth. To improve system performance and reduce the performance volatility of individual threads, last level cache and off-chip bandwidth partitioning schemes have been proposed. While how cache partitioning affects system performance is well understood, little is understood regarding how bandwidth partitioning affects system performance, and how bandwidth and cache partitioning interact with one another. In this paper, we propose a simple yet powerful analytical model that gives us an ability to answer several important questions: (1) How does off-chip bandwidth partitioning improve system performance? (2) In what situations the performance improvement is high or low, and what factors determine that? (3) In what way cache and bandwidth partitioning interact, and is the interaction negative or positive? (4) Can a theoretically optimum bandwidth partition be derived, and if so, what factors affect it? We believe understanding the answers to these questions is very valuable to CMP system designers in coming up with strategies to deal with the scarcity of off-chip bandwidth in future CMPs with many cores on a chip.}, booktitle={International symposium on high-performance computer}, author={Liu, F. and Jiang, X. W. and Solihin, Y.}, year={2010}, pages={57–68} } @article{liu_solihin_2010, title={Understanding the Behavior and Implications of Context Switch Misses}, volume={7}, ISSN={["1544-3973"]}, DOI={10.1145/1880043.1880048}, abstractNote={One of the essential features in modern computer systems is context switching, which allows multiple threads of execution to time-share a limited number of processors. While very useful, context switching can introduce high performance overheads, with one of the primary reasons being the cache perturbation effect. Between the time a thread is switched out and when it resumes execution, parts of its working set in the cache may be perturbed by other interfering threads, leading to (context switch) cache misses to recover from the perturbation. The goal of this article is to understand how cache parameters and application behavior influence the number of context switch misses the application suffers from. We characterize a previously unreported type of context switch misses that occur as the artifact of the interaction of cache replacement policy and an application's temporal reuse behavior. We characterize the behavior of these “reordered misses” for various applications, cache sizes, and various amount of cache perturbation. As a second contribution, we develop an analytical model that reveals the mathematical relationship between cache design parameters, an application's temporal reuse pattern, and the number of context switch misses the application suffers from. 
We validate the model against simulation studies and find that it is sufficiently accurate in predicting the trends of context switch misses with regard to various cache perturbation amounts. The mathematical relationship provided by the model allows us to derive insights into precisely why some applications are more vulnerable to context switch misses than others. Through a case study on prefetching, we find that prefetching tends to aggravate the number of context switch misses and that a less aggressive prefetching technique can reduce the number of context switch misses the application suffers from. We also investigate how cache sizes affect context switch misses. Our study shows that, under relatively heavy workloads in the system, the worst-case number of context switch misses an application suffers from tends to increase proportionally with cache sizes, to the extent that it may completely negate the reduction in other types of cache misses.}, number={4}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Liu, Fang and Solihin, Yan}, year={2010}, month={Dec} } @article{jiang_solihin_zhao_iyer_2009, title={Architecture Support for Improving Bulk Memory Copying and Initialization Performance}, ISBN={["978-0-7695-3771-9"]}, ISSN={["1089-795X"]}, DOI={10.1109/pact.2009.31}, abstractNote={Bulk memory copying and initialization are among the most ubiquitous operations performed in current computer systems by both user applications and operating systems. While many current systems rely on a loop of loads and stores, there are proposals to introduce a single instruction to perform bulk memory copying. While such an instruction can improve performance by generating fewer TLB and cache accesses and requiring fewer pipeline resources, in this paper we show that the key to significantly improving performance is removing the pipeline and cache bottlenecks of the code that follows the instruction. We show that the bottlenecks arise due to (1) the pipeline being clogged by the copying instruction, (2) the critical path being lengthened by dependent instructions stalling while waiting for the copying to complete, and (3) the inability to specify (separately) the cacheability of the source and destination regions. We propose FastBCI, an architecture support that achieves the granularity efficiency of a bulk copying/initialization instruction, but without its pipeline and cache bottlenecks. When applied to OS kernel buffer management, we show that on average FastBCI achieves speedups of anywhere between 23% and 32%, which is roughly 3x-4x that of an alternative scheme, and 1.5x-2x that of a highly optimistic DMA with zero setup and interrupt overheads.}, journal={18TH INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES, PROCEEDINGS}, author={Jiang, Xiaowei and Solihin, Yan and Zhao, Li and Iyer, Ravishankar}, year={2009}, pages={169-+} } @book{solihin_2009, title={Fundamentals of parallel computer architecture: multichip and multicore systems}, publisher={[United States?]: Solihin Pub}, author={Solihin, Yan}, year={2009} } @article{chhabra_rogers_solihin_prvulovic_2008, title={Making Secure Processors OS- and Performance-Friendly}, volume={5}, ISSN={["1544-3973"]}, DOI={10.1145/1498690.1498691}, abstractNote={In today's digital world, computer security issues have become increasingly important.
In particular, researchers have proposed designs for secure processors that utilize hardware-based memory encryption and integrity verification to protect the privacy and integrity of computation even from sophisticated physical attacks. However, currently proposed schemes remain hampered by problems that make them impractical for use in today's computer systems: lack of virtual memory and Inter-Process Communication support, as well as excessive storage and performance overheads. In this article, we propose (1) address-independent seed encryption (AISE), a counter-mode-based memory encryption scheme using a novel seed composition, and (2) Bonsai Merkle Trees (BMT), a novel Merkle tree-based memory integrity verification technique, to eliminate these system and performance issues associated with prior counter-mode memory encryption and Merkle tree integrity verification schemes. We present both a qualitative discussion and a quantitative analysis to illustrate the advantages of our techniques over previously proposed approaches in terms of complexity, feasibility, performance, and storage. Our results show that AISE+BMT reduces the overhead of prior memory encryption and integrity verification schemes from 12% to 2% on average for single-threaded benchmarks on uniprocessor systems, and from 15% to 4% for coscheduled benchmarks on multicore systems, while eliminating critical system-level problems.}, number={4}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Chhabra, Siddhartha and Rogers, Brian and Solihin, Yan and Prvulovic, Milos}, year={2008} } @article{venkataramani_doudalis_solihin_prvulovic_2009, title={MemTracker: An Accelerator for Memory Debugging and Monitoring}, volume={6}, ISSN={["1544-3973"]}, DOI={10.1145/1543753.1543754}, abstractNote={Memory bugs are a broad class of bugs that is becoming increasingly common with increasing software complexity, and many of these bugs are also security vulnerabilities. Existing software and hardware approaches for finding and identifying memory bugs have a number of drawbacks, including considerable performance overheads, coverage of only a specific type of bug, high implementation cost, and inefficient use of computational resources. This article describes MemTracker, a new hardware support mechanism that can be configured to perform different kinds of memory access monitoring tasks. MemTracker associates each word of data in memory with a few bits of state, and uses a programmable state transition table to react to different events that can affect this state. The number of state bits per word, the events to which MemTracker reacts, and the transition table are all fully programmable. MemTracker's rich set of states, events, and transitions can be used to implement different monitoring and debugging checkers with minimal performance overheads, even when frequent state updates are needed. To evaluate MemTracker, we map three different checkers onto it, as well as a checker that combines all three. For the most demanding (combined) checker with 8 bits of state per memory word, we observe performance overheads of only around 3% on average, and 14.5% in the worst case, across different benchmark suites.
Such low overheads allow continuous (always-on) use of MemTracker-enabled checkers, even in production runs.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Venkataramani, Guru and Doudalis, Ioannis and Solihin, Yan and Prvulovic, Milos}, year={2009}, month={Jun} } @inproceedings{tiwari_lee_tuck_solihin_2009, title={Memory Management Thread for Heap Allocation Intensive Applications}, DOI={10.1145/1621960.1621967}, abstractNote={Dynamic memory management is one of the most ubiquitous and expensive operations in many C/C++ applications. Some C/C++ programs might spend up to one third of their execution time in dynamic memory management routines. With multicore processors as a mainstream architecture, it is important to investigate how dynamic memory management can exploit multicore parallelism to speed up sequential programs. In this paper, we propose a way of exploiting multicore parallelism in dynamic memory management for sequential applications by spinning off memory allocation and deallocation functions to a separate thread that we refer to as the memory management thread (MMT). The goal of this study is to show how an efficient design and implementation of MMT can improve performance without any algorithm- or implementation-level knowledge of the underlying memory management library being offloaded. Using heap allocation-intensive benchmarks, we evaluate MMT on an Intel Core 2 Quad platform for the widely used Doug Lea memory allocator. Without any modifications to the application source code or the memory management algorithm of the underlying allocator, our MMT approach achieves an average speedup ratio of 1.19x, and 1.60x in the best case.}, booktitle={Workshop on Memory Performance: Dealing with Applications, Systems and Architecture}, author={Tiwari, D. and Lee, S. and Tuck, J. and Solihin, Y.}, year={2009} } @article{kharbutli_solihin_2008, title={Counter-based cache replacement and bypassing algorithms}, volume={57}, ISSN={["0018-9340"]}, DOI={10.1109/TC.2007.70816}, abstractNote={Recent studies have shown that, in highly associative caches, the performance gap between the least recently used (LRU) and the theoretical optimal replacement algorithms is large, motivating the design of alternative replacement algorithms to improve cache performance. In LRU replacement, a line, after its last use, remains in the cache for a long time until it becomes the LRU line. Such dead lines unnecessarily reduce the cache capacity available for other lines. In addition, in multilevel caches, temporal reuse patterns are often inverted, showing in the L1 cache but, due to the filtering effect of the L1 cache, not showing in the L2 cache. At the L2, these lines appear to be brought into the cache but are never reaccessed until they are replaced. These lines unnecessarily pollute the L2 cache. This paper proposes a new counter-based approach to deal with the above problems. For the former problem, we predict lines that have become dead and replace them early from the L2 cache. For the latter problem, we identify never-reaccessed lines, bypass the L2 cache, and place them directly in the L1 cache. Both techniques are achieved through a single counter-based mechanism. In our approach, each line in the L2 cache is augmented with an event counter that is incremented when an event of interest, such as certain cache accesses, occurs. When the counter reaches a threshold, the line "expires" and becomes replaceable. Each line's threshold is unique and is dynamically learned.
We propose and evaluate two new replacement algorithms: access interval predictor (AIP) and live-time predictor (LvP). AIP and LvP speed up 10 capacity-constrained SPEC2000 benchmarks by up to 48 percent, and by 15 percent on average (7 percent on average for the whole set of 21 SPEC2000 benchmarks). Cache bypassing further reduces L2 cache pollution and improves the average speedup to 17 percent (8 percent for the whole set of 21 SPEC2000 benchmarks).}, number={4}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Kharbutli, Mazen and Solihin, Yan}, year={2008}, month={Apr}, pages={433–447} } @book{solihin_2008, title={Fundamentals of parallel computer architecture}, publisher={[United States?]: Solihin Pub}, author={Solihin, Yan}, year={2008} } @article{lee_jung_lim_solihin_2009, title={Prefetching with Helper Threads for Loosely Coupled Multiprocessor Systems}, volume={20}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2008.224}, abstractNote={This paper presents a helper thread prefetching scheme that is designed to work on loosely coupled processors, such as in a standard chip multiprocessor (CMP) system or an intelligent memory system. Loosely coupled processors have an advantage in that resources such as the processor and L1 cache are not contended for by the application and helper threads, hence preserving the speed of the application. However, interprocessor communication is expensive in such a system. We present techniques to alleviate this. Our approach exploits large loop-based code regions and is based on a new synchronization mechanism between the application and helper threads. This mechanism precisely controls how far ahead the execution of the helper thread can be with respect to the application thread. We found that this is important in ensuring prefetching timeliness and avoiding cache pollution. To demonstrate that prefetching in a loosely coupled system can be done effectively, we evaluate our prefetching by simulating a standard unmodified CMP system and an intelligent memory system where a simple processor in memory executes the helper thread. Evaluating our scheme on nine memory-intensive applications with the memory processor in DRAM achieves an average speedup of 1.25. Moreover, our scheme works well in combination with a conventional processor-side sequential L1 prefetcher, resulting in an average speedup of 1.31. In a standard CMP, the scheme achieves an average speedup of 1.33. Using a real CMP system with a shared L2 cache between two cores, our helper thread prefetching plus hardware L2 prefetching achieves an average speedup of 1.15 over the hardware L2 prefetching alone for the subset of applications with high L2 cache misses per cycle.}, number={9}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Lee, Jaejin and Jung, Changhee and Lim, Daeseob and Solihin, Yan}, year={2009}, month={Sep}, pages={1309–1324} } @article{shetty_kharbutli_solihin_prvulovic_2006, title={HeapMon: A helper-thread approach to programmable, automatic, and low-overhead memory bug detection}, volume={50}, ISSN={["2151-8556"]}, DOI={10.1147/rd.502.0261}, abstractNote={The ability to detect and pinpoint memory-related bugs in production runs is important because in-house testing may miss bugs. This paper presents HeapMon, a heap memory bug-detection scheme that has a very low performance overhead, is automatic, and is easy to deploy. HeapMon relies on two new techniques.
First, it decouples application execution from bug monitoring, which executes as a helper thread on a separate core in a chip multiprocessor system. Second, it associates a filter bit with each cached word to safely and significantly reduce the bug checking frequency, by 95% on average. We test the effectiveness of these techniques using existing and injected memory bugs in SPEC2000 applications and show that HeapMon effectively detects and identifies most forms of heap memory bugs. Our results also indicate that the HeapMon performance overhead is only 5% on average, orders of magnitude less than that of existing tools. Its storage overhead is also modest: 3.1% of the cache size and a 32-KB victim cache for on-chip filter bits, and 6.2% of the allocated heap memory size for state bits, which are maintained by the helper thread as a software data structure.}, number={2-3}, journal={IBM JOURNAL OF RESEARCH AND DEVELOPMENT}, author={Shetty, R. and Kharbutli, M. and Solihin, Y. and Prvulovic, M.}, year={2006}, pages={261–275} } @article{kharbutli_solihin_lee_2005, title={Eliminating conflict misses using prime number-based cache indexing}, volume={54}, ISSN={["1557-9956"]}, DOI={10.1109/TC.2005.79}, abstractNote={Using alternative cache indexing/hashing functions is a popular technique to reduce conflict misses by achieving a more uniform cache access distribution across the sets in the cache. Although various alternative hashing functions have been demonstrated to eliminate the worst-case conflict behavior, no study has thoroughly analyzed the pathological behavior of such hashing functions, which often results in performance slowdowns. We present an in-depth analysis of the pathological behavior of cache hashing functions. Based on the analysis, we propose two new hashing functions, prime modulo and odd-multiplier displacement, that are resistant to pathological behavior and yet are able to eliminate the worst-case conflict behavior in the L2 cache. We show that these two schemes can be implemented in fast hardware using a set of narrow addition operations, with negligible fragmentation in the L2 cache. We evaluate the schemes on 23 memory-intensive applications. For applications that have nonuniform cache accesses, both prime modulo and odd-multiplier displacement hashing achieve an average speedup of 1.27 compared to traditional hashing, without slowing down any of the 23 benchmarks. We also evaluate using the odd-multiplier displacement function with multiple multipliers in conjunction with a skewed associative L2 cache. The skewed associative cache achieves a better average speedup at the cost of some pathological behavior that slows down four applications by up to 7 percent.}, number={5}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Kharbutli, M. and Solihin, Y. and Lee, J.}, year={2005}, month={May}, pages={573–586} } @article{solihin_lee_torrellas_2003, title={Correlation prefetching with a user-level memory thread}, volume={14}, DOI={10.1109/tpds.2003.1206504}, abstractNote={This paper proposes using a user-level memory thread (ULMT) for correlation prefetching. In this approach, a user thread runs on a general-purpose processor in main memory, either in the memory controller chip or in a DRAM chip. The thread performs correlation prefetching in software, sending the prefetched data into the L2 cache of the main processor.
This approach requires minimal hardware beyond the memory processor: the correlation table is a software data structure that resides in main memory, while the main processor needs only a few modifications to its L2 cache so that it can accept incoming prefetches. In addition, the approach has wide applicability, as it can effectively prefetch even for irregular applications. Finally, it is very flexible, as the prefetching algorithm can be customized by the user on a per-application basis. Our simulation results show that, through a new design of the correlation table and prefetching algorithm, our scheme delivers good results. Specifically, nine mostly irregular applications show an average speedup of 1.32. Furthermore, our scheme works well in combination with a conventional processor-side sequential prefetcher, in which case the average speedup increases to 1.46. Finally, by exploiting the customization of the prefetching algorithm, we increase the average speedup to 1.53.}, number={6}, journal={IEEE Transactions on Parallel and Distributed Systems}, author={Solihin, Y. and Lee, J. and Torrellas, J.}, year={2003}, pages={563–580} }