@article{lin_dai_mantor_zhou_2019, title={Coordinated CTA Combination and Bandwidth Partitioning for GPU Concurrent Kernel Execution}, volume={16}, ISSN={1544-3973}, DOI={10.1145/3326124}, abstractNote={Contemporary GPUs support running multiple kernels concurrently on the same streaming multiprocessors (SMs). Recent studies have demonstrated that such concurrent kernel execution (CKE) improves both resource utilization and computational throughput. Most prior works focus on partitioning GPU resources at the cooperative thread array (CTA) level or the warp scheduler level to improve CKE. However, significant performance slowdown and unfairness are observed when latency-sensitive kernels co-run with bandwidth-intensive ones. The reason is that bandwidth over-subscription by bandwidth-intensive kernels severely aggravates memory access latency, which is highly detrimental to latency-sensitive kernels. Even among bandwidth-intensive kernels, more intensive kernels may unfairly consume much higher bandwidth than less intensive ones.}, number={3}, journal={ACM Transactions on Architecture and Code Optimization}, author={Lin, Zhen and Dai, Hongwen and Mantor, Michael and Zhou, Huiyang}, year={2019}, month={Aug} }
@inproceedings{lin_alshboul_solihin_zhou_2019, title={Exploring Memory Persistency Models for GPUs}, ISSN={1089-795X}, DOI={10.1109/PACT.2019.00032}, abstractNote={Given its high integration density, high speed, byte addressability, and low standby power, non-volatile or persistent memory is expected to supplement/replace DRAM as main memory. Through a persistency programming model (which defines the durability ordering of stores) and durable transaction constructs, the programmer can provide recoverable data structures (RDS), which allow programs to recover to a consistent state after a failure. While persistency models have been well studied for CPUs, they have been neglected for graphics processing units (GPUs). Considering the importance of GPUs as a dominant accelerator for high performance computing, we investigate persistency models for GPUs. GPU applications differ substantially from CPU applications; hence, in this paper we adapt, re-architect, and optimize CPU persistency models for GPUs. We design a pragma-based compiler scheme for expressing persistency models for GPUs. We identify that the thread hierarchy in GPUs offers intuitive scopes to form epochs and durable transactions. We find that undo logging produces significant performance overheads. We propose to use idempotency analysis to reduce both logging frequency and the size of logs. Through both real-system and simulation evaluations, we show that our proposed architecture support incurs low overheads.}, booktitle={28th International Conference on Parallel Architectures and Compilation Techniques (PACT)}, author={Lin, Zhen and Alshboul, Mohammad and Solihin, Yan and Zhou, Huiyang}, year={2019}, pages={310–322} }
@article{lin_mathur_zhou_2019, title={Scatter-and-Gather Revisited: High-Performance Side-Channel-Resistant AES on GPUs}, DOI={10.1145/3300053.3319415}, abstractNote={Recent works have shown that there exist microarchitectural timing channels in contemporary GPUs, which make table-based cryptographic algorithms like AES vulnerable to side channel timing attacks. Also, table-based cryptographic algorithms have been known to be vulnerable to prime-and-probe attacks due to their key-dependent footprint in the data cache.
Such analyses raise serious concerns about the feasibility of accelerating table-based cryptographic algorithms on GPUs. In this paper, we revisit the scatter-and-gather (SG) approach and make a case for using this approach to implement table-based cryptographic algorithms on GPUs to achieve both high performance and strong resistance to side channel attacks. Our results show that our SG-based AES achieves both high performance and strong resistance against all the known side channel attacks across different generations of NVIDIA GPUs. We also reveal unexpected findings on a new timing channel in the L1 data cache (D-cache) on NVIDIA Maxwell and Pascal GPUs.}, journal={12th Workshop on General Purpose Processing Using GPUs (GPGPU 12)}, author={Lin, Zhen and Mathur, Utkarsh and Zhou, Huiyang}, year={2019}, pages={2–11} }
@inproceedings{dai_lin_li_zhao_wang_zheng_zhou_2018, title={Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls}, url={http://dx.doi.org/10.1109/hpca.2018.00027}, DOI={10.1109/hpca.2018.00027}, abstractNote={With continued technology scaling, graphics processing units (GPUs) incorporate an increasing amount of computing resources, and it becomes difficult for a single GPU kernel to fully utilize the vast GPU resources. One solution to improve resource utilization is concurrent kernel execution (CKE). Early CKE mainly targets the leftover resources. However, it fails to optimize resource utilization and does not provide fairness among concurrent kernels. Spatial multitasking assigns a subset of streaming multiprocessors (SMs) to each kernel. Although it achieves better fairness, it does not address resource underutilization within an SM. Thus, intra-SM sharing has been proposed to issue thread blocks from different kernels to each SM. However, as shown in this study, overall performance may be undermined in intra-SM sharing schemes due to severe interference among kernels. Specifically, as concurrent kernels share the memory subsystem, one kernel, even a compute-intensive one, may starve because it cannot issue memory instructions in time. In addition, severe L1 D-cache thrashing and memory pipeline stalls caused by one kernel, especially a memory-intensive one, impact other kernels, further hurting overall performance. In this study, we investigate various approaches to overcome the aforementioned problems exposed in intra-SM sharing. We first highlight that cache partitioning techniques proposed for CPUs are not effective for GPUs. Then we propose two approaches to reduce memory pipeline stalls. The first is to balance the memory accesses of concurrent kernels. The second is to limit the number of in-flight memory instructions issued from individual kernels. Our evaluation shows that the proposed schemes significantly improve the weighted speedup of two state-of-the-art intra-SM sharing schemes, Warped-Slicer and SMK, by 24.6% and 27.2% on average, respectively, with lightweight hardware overhead.}, booktitle={2018 IEEE International Symposium on High Performance Computer Architecture (HPCA)}, author={Dai, H. and Lin, Z. and Li, C. and Zhao, C. and Wang, F. and Zheng, N. and Zhou, H.}, year={2018}, month={Feb} }
@article{lin_mantor_zhou_2018, title={GPU Performance vs. Thread-Level Parallelism: Scalability Analysis and a Novel Way to Improve TLP}, volume={15}, ISSN={1544-3973}, DOI={10.1145/3177964}, abstractNote={Graphics processing units (GPUs) leverage massive thread-level parallelism (TLP) to achieve high computation throughput and hide long memory latency. However, recent studies have shown that GPU performance does not scale with GPU occupancy or the degree of TLP that a GPU supports, especially for memory-intensive workloads. The current understanding attributes this to L1 D-cache contention or off-chip memory bandwidth. In this article, we perform a novel scalability analysis from the perspective of throughput utilization of various GPU components, including off-chip DRAM, multiple levels of caches, and the interconnect between L1 D-caches and L2 partitions. We show that the interconnect bandwidth is a critical bound for GPU performance scalability.}, number={1}, journal={ACM Transactions on Architecture and Code Optimization}, author={Lin, Zhen and Mantor, Michael and Zhou, Huiyang}, year={2018}, month={Apr} }
@inproceedings{lin_nyland_zhou_2016, title={Enabling Efficient Preemption for SIMT Architectures with Lightweight Context Switching}, DOI={10.1109/sc.2016.76}, abstractNote={Context switching is a key technique enabling preemption and time-multiplexing for CPUs. However, for single-instruction multiple-thread (SIMT) processors such as high-end graphics processing units (GPUs), it is challenging to support context switching due to the massive number of threads, which leads to a huge amount of architectural state to be swapped during context switching. The architectural state of SIMT processors includes registers, shared memory, SIMT stacks, and barrier states. Recent works present thread-block-level preemption on SIMT processors to avoid context switching overhead. However, because the execution time of a thread block (TB) is highly dependent on the kernel program, the response time of preemption cannot be guaranteed, and some TB-level preemption techniques cannot be applied to all kernel functions. In this paper, we propose three complementary ways to reduce and compress the architectural state to achieve lightweight context switching on SIMT processors. Experiments show that our approaches can reduce the register context size by 91.5% on average. Based on lightweight context switching, we enable instruction-level preemption on SIMT processors with compiler and hardware co-design. With our proposed schemes, the preemption latency is reduced by 59.7% on average compared to the naive approach.}, booktitle={SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, author={Lin, Zhen and Nyland, L. and Zhou, H. Y.}, year={2016}, pages={898–908} }
@inproceedings{li_yang_lin_zhou_2015, title={Automatic Data Placement into GPU On-Chip Memory Resources}, DOI={10.1109/cgo.2015.7054184}, abstractNote={Although graphics processing units (GPUs) rely on thread-level parallelism to hide long off-chip memory access latency, judicious utilization of on-chip memory resources, including register files, shared memory, and data caches, is critical to application performance. However, explicitly managing GPU on-chip memory resources is a non-trivial task for application developers. More importantly, as on-chip memory resources vary among different GPU generations, performance portability has become a daunting challenge.
In this paper, we tackle this problem with compiler-driven automatic data placement. We focus on programs that have already been reasonably optimized either manually by programmers or automatically by compiler tools. Our proposed compiler algorithms refine these programs by revising data placement across different types of GPU on-chip resources to achieve both performance enhancement and performance portability. Among the 12 benchmarks in our study, our proposed compiler algorithm improves performance by 1.76× on average on the NVIDIA GTX480 and by 1.61× on average on the GTX680.}, booktitle={2015 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)}, author={Li, C. and Yang, Y. and Lin, Zhen and Zhou, H. Y.}, year={2015}, pages={23–33} }
@inbook{gles: a practical gpgpu optimizing compiler using data sharing and thread coarsening_2015, url={http://dx.doi.org/10.1007/978-3-319-17473-0_3}, DOI={10.1007/978-3-319-17473-0_3}, abstractNote={Writing optimized CUDA programs for general-purpose graphics processing units (GPGPUs) is complicated and error-prone. Most prior compiler optimization methods are impractical for many applications that contain divergent control flow, and they fail to fully exploit optimization opportunities in data sharing and thread coarsening. In this paper, we present GLES, an optimizing compiler for GPGPU programs. GLES proposes two optimization techniques based on divergence analysis. The first is a data sharing optimization for data reuse and bandwidth enhancement. The second is thread granularity coarsening to reduce redundant instructions. Our experiments on 6 real-world programs show that GPGPU programs optimized by GLES achieve performance similar to manually tuned GPGPU programs. Furthermore, GLES is not only applicable to a much wider range of GPGPU programs than the state-of-the-art GPGPU optimizing compiler, but it also achieves higher or comparable performance on 8 out of 9 benchmarks.}, booktitle={Languages and Compilers for Parallel Computing}, year={2015} }
@inproceedings{implementation and evaluation of deep neural networks (dnn) on mainstream heterogeneous systems_2014, url={http://dx.doi.org/10.1145/2637166.2637229}, DOI={10.1145/2637166.2637229}, abstractNote={Deep neural networks (DNNs), with deep layers and very high-dimensional parameters, have demonstrated breakthrough learning capability in machine learning. DNNs with big-data input are now leading a new direction in large-scale object recognition. DNN training requires a vast amount of computing power, which poses a great challenge to system design. DNN training embraces massive thread and data parallelism, which maps naturally to GPUs. There are various heterogeneous systems, including discrete CPUs paired with GPUs and chip-level integrated CPU+GPU heterogeneous processors named APUs. In this paper, we explore the implementation of DNN models on different heterogeneous platforms to provide a systematic evaluation and comparison. Specifically, we implement two well-known DNN kernels, the multi-layer perceptron (MLP) and the autoencoder, on various GPUs and APUs from mainstream processor manufacturers. Evaluation results show that GPUs are faster than APUs but consume much more power. APUs achieve up to 2x higher performance per watt, which indicates that APU servers can be an energy-efficient and high-density solution for accelerating DNN applications. This paper also conducts bottleneck analysis and presents optimization techniques for the various platforms.}, booktitle={Proceedings of the 5th Asia-Pacific Workshop on Systems - APSys '14}, year={2014} }