@inproceedings{sheikh_kharbutli_2010,
  title={Improving cache performance by combining cost-sensitivity and locality principles in cache replacement algorithms},
  booktitle={2010 IEEE International Conference on Computer Design},
  author={Sheikh, R. and Kharbutli, M.},
  year={2010},
  pages={76--83}
}

@article{kharbutli_solihin_2008,
  title={Counter-based cache replacement and bypassing algorithms},
  volume={57},
  number={4},
  ISSN={0018-9340},
  DOI={10.1109/TC.2007.70816},
  journal={IEEE Transactions on Computers},
  author={Kharbutli, Mazen and Solihin, Yan},
  year={2008},
  month={Apr},
  pages={433--447},
  abstractNote={Recent studies have shown that, in highly associative caches, the performance gap between the least recently used (LRU) and the theoretical optimal replacement algorithms is large, motivating the design of alternative replacement algorithms to improve cache performance. In LRU replacement, a line, after its last use, remains in the cache for a long time until it becomes the LRU line. Such dead lines unnecessarily reduce the cache capacity available for other lines. In addition, in multilevel caches, temporal reuse patterns are often inverted, showing in the L1 cache but, due to the filtering effect of the L1 cache, not showing in the L2 cache. At the L2, these lines appear to be brought into the cache but are never reaccessed until they are replaced. These lines unnecessarily pollute the L2 cache. This paper proposes a new counter-based approach to deal with the above problems. For the former problem, we predict lines that have become dead and replace them early from the L2 cache. For the latter problem, we identify never-reaccessed lines, bypass the L2 cache, and place them directly in the L1 cache. Both techniques are achieved through a single counter-based mechanism. In our approach, each line in the L2 cache is augmented with an event counter that is incremented when an event of interest, such as certain cache accesses, occurs. When the counter reaches a threshold, the line ``expires'' and becomes replaceable. Each line's threshold is unique and is dynamically learned. We propose and evaluate two new replacement algorithms: access interval predictor (AIP) and live-time predictor (LvP). AIP and LvP speed up 10 capacity-constrained SPEC2000 benchmarks by up to 48 percent and 15 percent on average (7 percent on average for the whole set of 21 SPEC2000 benchmarks). Cache bypassing further reduces L2 cache pollution and improves the average speedups to 17 percent (8 percent for the whole set of 21 SPEC2000 benchmarks).}
}
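Reading aid for the entry above: the abstract describes each L2 line carrying an event counter that, on reaching a dynamically learned threshold, makes the line ``expire'' and become a preferred replacement victim. The minimal C sketch below illustrates that expiration idea only; the data layout, the initial threshold of 8, and the crude interval-based learning rule are our assumptions, not the paper's AIP/LvP mechanism.

/* Illustrative sketch (assumptions ours) of counter-based expiration:
 * every access to a set is an "event" that ages all resident lines;
 * a line whose counter reaches its learned threshold is predicted
 * dead and chosen as the replacement victim. */
#include <stdio.h>

#define WAYS 4

typedef struct {
    int tag;        /* simplified line identity                 */
    int counter;    /* events (set accesses) since last use     */
    int threshold;  /* learned expiration threshold             */
    int valid;
} Line;

/* Prefer an invalid line, then an expired one, then the line
 * closest to expiring. */
static int pick_victim(const Line set[WAYS]) {
    int best = 0;
    for (int w = 0; w < WAYS; w++) {
        if (!set[w].valid) return w;
        if (set[w].counter >= set[w].threshold) return w;  /* expired */
        if (set[w].threshold - set[w].counter <
            set[best].threshold - set[best].counter)
            best = w;
    }
    return best;
}

int main(void) {
    Line set[WAYS] = {0};
    int trace[] = {1, 2, 3, 1, 4, 5, 1, 6};
    int n = (int)(sizeof trace / sizeof trace[0]);

    for (int i = 0; i < n; i++) {
        int tag = trace[i], hit = -1;

        /* Every access to the set ages all valid lines. */
        for (int w = 0; w < WAYS; w++) {
            if (set[w].valid) set[w].counter++;
            if (set[w].valid && set[w].tag == tag) hit = w;
        }

        if (hit >= 0) {
            /* Crude stand-in for the paper's dynamic learning:
             * adopt the observed access interval as the threshold. */
            set[hit].threshold = set[hit].counter;
            set[hit].counter = 0;
            printf("hit  %d\n", tag);
        } else {
            int v = pick_victim(set);
            set[v] = (Line){ tag, 0, /* assumed initial threshold */ 8, 1 };
            printf("miss %d -> way %d\n", tag, v);
        }
    }
    return 0;
}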
@article{shetty_kharbutli_solihin_prvulovic_2006,
  title={HeapMon: A helper-thread approach to programmable, automatic, and low-overhead memory bug detection},
  volume={50},
  number={2-3},
  ISSN={2151-8556},
  DOI={10.1147/rd.502.0261},
  journal={IBM Journal of Research and Development},
  author={Shetty, R. and Kharbutli, M. and Solihin, Y. and Prvulovic, M.},
  year={2006},
  pages={261--275},
  abstractNote={The ability to detect and pinpoint memory-related bugs in production runs is important because in-house testing may miss bugs. This paper presents HeapMon, a heap memory bug-detection scheme that has a very low performance overhead, is automatic, and is easy to deploy. HeapMon relies on two new techniques. First, it decouples application execution from bug monitoring, which executes as a helper thread on a separate core in a chip multiprocessor system. Second, it associates a filter bit with each cached word to safely and significantly reduce the bug-checking frequency, by 95 percent on average. We test the effectiveness of these techniques using existing and injected memory bugs in SPEC2000 applications and show that HeapMon effectively detects and identifies most forms of heap memory bugs. Our results also indicate that the HeapMon performance overhead is only 5 percent on average, orders of magnitude less than that of existing tools. Its storage overhead is also modest: 3.1 percent of the cache size and a 32-KB victim cache for on-chip filter bits, and 6.2 percent of the allocated heap memory size for state bits, which are maintained by the helper thread as a software data structure.}
}

@article{kharbutli_solihin_lee_2005,
  title={Eliminating conflict misses using prime number-based cache indexing},
  volume={54},
  number={5},
  ISSN={1557-9956},
  DOI={10.1109/TC.2005.79},
  journal={IEEE Transactions on Computers},
  author={Kharbutli, M. and Solihin, Y. and Lee, J.},
  year={2005},
  month={May},
  pages={573--586},
  abstractNote={Using alternative cache indexing/hashing functions is a popular technique to reduce conflict misses by achieving a more uniform cache access distribution across the sets in the cache. Although various alternative hashing functions have been demonstrated to eliminate the worst-case conflict behavior, no study has really analyzed the pathological behavior of such hashing functions that often results in performance slowdown. We present an in-depth analysis of the pathological behavior of cache hashing functions. Based on the analysis, we propose two new hashing functions, prime modulo and odd-multiplier displacement, that are resistant to pathological behavior and yet are able to eliminate the worst-case conflict behavior in the L2 cache. We show that these two schemes can be implemented in fast hardware using a set of narrow addition operations, with negligible fragmentation in the L2 cache. We evaluate the schemes on 23 memory-intensive applications. For applications that have nonuniform cache accesses, both prime modulo and odd-multiplier displacement hashing achieve an average speedup of 1.27 compared to traditional hashing, without slowing down any of the 23 benchmarks. We also evaluate the odd-multiplier displacement function with multiple multipliers in conjunction with a skewed associative L2 cache. The skewed associative cache achieves a better average speedup at the cost of some pathological behavior that slows down four applications by up to 7 percent.}
}
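Reading aid for the entry above: under conventional modulo-2^k indexing, a power-of-two access stride maps every block to the same set, while a prime modulus spreads the same stream across many sets. The C sketch below demonstrates only this indexing arithmetic, with assumed sizes (1024 sets versus the nearest prime below it, 1021) and an assumed 64-reference strided stream; it is not the paper's hardware implementation.

/* Illustrative sketch (sizes assumed): count distinct sets touched by
 * a power-of-two stride under modulo-2^k vs. prime modulo indexing. */
#include <stdio.h>

#define POW2_SETS  1024   /* conventional modulo-2^k indexing   */
#define PRIME_SETS 1021   /* nearest prime below 1024           */

/* How many distinct cache sets does a strided block stream touch
 * under a given modulus? */
static int distinct_sets(long stride, int n, int sets) {
    char seen[POW2_SETS] = {0};
    int count = 0;
    for (long i = 0; i < n; i++) {
        int s = (int)((i * stride) % sets);
        if (!seen[s]) { seen[s] = 1; count++; }
    }
    return count;
}

int main(void) {
    /* A stride of 1024 blocks (e.g. walking one column of a wide
     * matrix) collapses onto one set under modulo-1024 indexing but
     * spreads across 64 sets under modulo-1021 (prime) indexing. */
    long stride = 1024;
    printf("pow2  indexing: %2d distinct sets\n",
           distinct_sets(stride, 64, POW2_SETS));
    printf("prime indexing: %2d distinct sets\n",
           distinct_sets(stride, 64, PRIME_SETS));
    return 0;
}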