@article{lin_dai_mantor_zhou_2019,
  title={Coordinated CTA Combination and Bandwidth Partitioning for GPU Concurrent Kernel Execution},
  volume={16},
  number={3},
  ISSN={1544-3973},
  DOI={10.1145/3326124},
  abstractNote={Contemporary GPUs support multiple kernels running concurrently on the same streaming multiprocessors (SMs). Recent studies have demonstrated that such concurrent kernel execution (CKE) improves both resource utilization and computational throughput. Most prior works focus on partitioning GPU resources at the cooperative thread array (CTA) level or the warp scheduler level to improve CKE. However, significant performance slowdown and unfairness are observed when latency-sensitive kernels co-run with bandwidth-intensive ones. The reason is that bandwidth over-subscription from bandwidth-intensive kernels leads to much aggravated memory access latency, which is highly detrimental to latency-sensitive kernels. Even among bandwidth-intensive kernels, more intensive kernels may unfairly consume much higher bandwidth than less intensive ones.},
  journal={ACM Transactions on Architecture and Code Optimization},
  author={Lin, Zhen and Dai, Hongwen and Mantor, Michael and Zhou, Huiyang},
  year={2019},
  month={Aug}
}

@inproceedings{dai_li_zhou_gupta_kartsaklis_mantor_2016,
  title={A Model-Driven Approach to Warp/Thread-Block Level GPU Cache Bypassing},
  ISSN={0738-100X},
  DOI={10.1145/2897937.2897966},
  abstractNote={The high volume of memory requests from massive numbers of threads can easily cause cache contention and cache-miss-related resource congestion on GPUs. This paper proposes a simple yet effective performance model to estimate the impact of cache contention and resource congestion as a function of the number of warps/thread blocks (TBs) that bypass the cache. We then design a hardware-based dynamic warp/thread-block level GPU cache bypassing scheme, which achieves a 1.68x speedup on average over the baseline on a set of memory-intensive benchmarks. Compared to prior works, our scheme achieves a 21.6% performance improvement over SWL-best [29] and 11.9% over CBWT-best [4] on average.},
  booktitle={2016 ACM/EDAC/IEEE Design Automation Conference (DAC)},
  author={Dai, Hongwen and Li, Chao and Zhou, Huiyang and Gupta, Saurabh and Kartsaklis, Christos and Mantor, Mike},
  year={2016}
}

@inproceedings{mayank_dai_wei_huiyang_2015,
  title={Analyzing graphics processor unit (GPU) instruction set architectures},
  DOI={10.1109/ispass.2015.7095794},
  abstractNote={Because of their high throughput and power efficiency, massively parallel architectures like graphics processing units (GPUs) have become a popular platform for general-purpose computing. However, there are few studies and analyses of GPU instruction set architectures (ISAs), although it is well-known that the ISA is a fundamental design issue for all modern processors, including GPUs.},
  booktitle={IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  author={Mayank, K. and Dai, H. W. and Wei, J. Z. and Zhou, H. Y.},
  year={2015},
  pages={155–156}
}

@inproceedings{li_yang_dai_yan_mueller_zhou_2014,
  title={Understanding the tradeoffs between software-managed vs. hardware-managed caches in GPUs},
  booktitle={IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  author={Li, C. and Yang, Y. and Dai, H. W. and Yan, S. G. and Mueller, F. and Zhou, H. Y.},
  year={2014},
  pages={231–241}
}