@inproceedings{shin_cox_oskin_loh_solihin_bhattacharjee_basu_2018,
  title={Scheduling Page Table Walks for Irregular GPU Applications},
  ISSN={1063-6897},
  DOI={10.1109/ISCA.2018.00025},
  abstractNote={Recent studies on commercial hardware demonstrated that irregular GPU applications can bottleneck on virtual-to-physical address translations. In this work, we explore ways to reduce address translation overheads for such applications. We discover that the order in which a GPU's address translation requests (specifically, page table walks) are serviced plays a key role in determining the translation overhead experienced by an application. We find that different SIMD instructions executed by an application require vastly different amounts of work to service their address translation needs, depending primarily on the number of distinct pages they access. We show that better forward progress is achieved by prioritizing translation requests from the instructions that require less work to service their address translation needs. Further, in the GPU's Single-Instruction-Multiple-Thread (SIMT) execution paradigm, all threads that execute in lockstep (a wavefront) must finish operating on their respective data elements (and thus finish their address translations) before execution moves ahead. Batching walk requests that originate from the same SIMD instruction can therefore reduce unnecessary stalls. We demonstrate that reordering translation requests based on these principles improves the performance of several irregular GPU applications by 30% on average.},
  booktitle={2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
  author={Shin, Seunghee and Cox, Guilherme and Oskin, Mark and Loh, Gabriel H. and Solihin, Yan and Bhattacharjee, Abhishek and Basu, Arkaprava},
  year={2018},
  pages={180--192}
}

@inproceedings{shin_kim_solihin_2016,
  title={Dense Footprint Cache: Capacity-Efficient Die-Stacked DRAM Last Level Cache},
  DOI={10.1145/2989081.2989096},
  abstractNote={Die-stacked DRAM technology enables a large Last Level Cache (LLC) that provides high-bandwidth data access to the processor. However, it requires a large tag array that may take a significant portion of the on-chip SRAM budget. To reduce this SRAM overhead, systems like Intel Haswell rely on a large block (Mblock) size. One drawback of a large Mblock size is that many bytes of an Mblock are fetched into the cache even though the processor never needs them. A recent technique (Footprint Cache) addresses this problem by dividing the Mblock into smaller blocks, where only blocks predicted to be needed by the processor are brought into the LLC. While this alleviates the excessive bandwidth consumption of fetching unneeded blocks, the capacity waste remains: only blocks predicted useful are fetched and allocated, and the remaining area of the Mblock is left empty, creating holes. Unfortunately, holes incur a significant capacity overhead, wasting space that could have held useful data and spending refresh power on useless data. In this paper, we propose a new design, Dense Footprint Cache (DFC). Like Footprint Cache, DFC uses a large Mblock and relies on useful-block prediction to reduce memory bandwidth consumption. However, when blocks of an Mblock are fetched, they are placed contiguously in the cache, thereby eliminating holes, increasing capacity and power efficiency, and improving performance. Mblocks in DFC have variable sizes and a cache set has variable associativity, which presents new challenges in designing its management policies (placement, replacement, and update). Through simulation of Big Data applications, we show that DFC reduces LLC miss ratios by about 43%, speeds up applications by 9.5%, and consumes 4.3% less energy on average.},
  booktitle={MEMSYS 2016: Proceedings of the International Symposium on Memory Systems},
  author={Shin, Seunghee and Kim, Sihong and Solihin, Yan},
  year={2016},
  pages={191--203}
}