@article{gupta_xiang_yang_zhou_2013,
  title        = {Locality principle revisited: A probability-based quantitative approach},
  volume       = {73},
  issn         = {1096-0848},
  doi          = {10.1016/j.jpdc.2013.01.010},
  abstractNote = {This paper revisits the fundamental concept of the locality of references and proposes to quantify it as a conditional probability: in an address stream, given the condition that an address is accessed, how likely the same address (temporal locality) or an address within its neighborhood (spatial locality) will be accessed in the near future. Previous works use reuse distance histograms as a measure of temporal locality. For spatial locality, some ad hoc metrics have been proposed as a quantitative measure. In contrast, our conditional probability-based locality measure has a clear mathematical meaning and provides a theoretically sound and unified way to quantify both temporal and spatial locality. We showcase that our quantified locality measure can be used to evaluate compiler optimizations, to analyze the locality at different levels of memory hierarchy, to optimize the cache architecture to effectively leverage the locality, and to examine the effect of data prefetching mechanisms.},
  number       = {7},
  journal      = {Journal of Parallel and Distributed Computing},
  author       = {Gupta, Saurabh and Xiang, Ping and Yang, Yi and Zhou, Huiyang},
  year         = {2013},
  month        = jul,
  pages        = {1011--1027},
}

@article{yang_zhou_2013,
  title   = {The Implementation of a High Performance {GPGPU} Compiler},
  volume  = {41},
  issn    = {1573-7640},
  doi     = {10.1007/s10766-012-0228-3},
  number  = {6},
  journal = {International Journal of Parallel Programming},
  author  = {Yang, Yi and Zhou, Huiyang},
  year    = {2013},
  month   = dec,
  pages   = {768--781},
}

@article{yang_xiang_kong_mantor_zhou_2012,
  title        = {A Unified Optimizing Compiler Framework for Different {GPGPU} Architectures},
  volume       = {9},
  issn         = {1544-3973},
  doi          = {10.1145/2207222.2207225},
  abstractNote = {This article presents a novel optimizing compiler for general purpose computation on graphics processing units (GPGPU). It addresses two major challenges of developing high performance GPGPU programs: effective utilization of GPU memory hierarchy and judicious management of parallelism. The input to our compiler is a na{\"i}ve GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler generates two kernels, one optimized for global memories and the other for texture memories. The proposed compilation process is effective for both AMD/ATI and NVIDIA GPUs. The experiments show that our optimized code achieves very high performance, either superior or very close to highly fine-tuned libraries.},
  number       = {2},
  journal      = {ACM Transactions on Architecture and Code Optimization},
  author       = {Yang, Yi and Xiang, Ping and Kong, Jingfei and Mantor, Mike and Zhou, Huiyang},
  year         = {2012},
  month        = jun,
}

@inproceedings{yang_xiang_mantor_zhou_2012,
  title     = {{CPU}-assisted {GPGPU} on fused {CPU}-{GPU} architectures},
  booktitle = {2012 {IEEE} 18th International Symposium on High-Performance Computer Architecture ({HPCA})},
  author    = {Yang, Yi and Xiang, Ping and Mantor, Mike and Zhou, Huiyang},
  year      = {2012},
  pages     = {103--114},
}

@inproceedings{gupta_xiang_yang_huiyang_2012,
  title        = {Locality principle revisited: A probability-based quantitative approach},
  doi          = {10.1109/ipdps.2012.93},
  abstractNote = {This paper revisits the fundamental concept of the locality of references and proposes to quantify it as a conditional probability: in an address stream, given the condition that an address is accessed, how likely the same address (temporal locality) or an address within its neighborhood (spatial locality) will be accessed in the near future. Based on this definition, spatial locality is a function of two parameters, the neighborhood size and the scope of near future, and can be visualized with a 3D mesh. Temporal locality becomes a special case of spatial locality with the neighborhood size being zero byte. Previous works on locality analysis use stack/reuse distances to compute distance histograms as a measure of temporal locality. For spatial locality, some ad-hoc metrics have been proposed as a quantitative measure. In contrast, our conditional probability-based locality measure has a clear mathematical meaning, offers justification for distance histograms, and provides a theoretically sound and unified way to quantify both temporal and spatial locality. The proposed locality measure clearly exhibits the inherent application characteristics, from which we can easily derive information such as the sizes of the working data sets and how locality can be exploited. We showcase that our quantified locality visualized in 3D-meshes can be used to evaluate compiler optimizations, to analyze the locality at different levels of memory hierarchy, to optimize the cache architecture to effectively leverage the locality, and to examine the effect of data prefetching mechanisms. A GPU-based parallel algorithm is also presented to accelerate the locality computation for large address traces.},
  booktitle    = {2012 {IEEE} 26th International Parallel and Distributed Processing Symposium ({IPDPS})},
  author       = {Gupta, Saurabh and Xiang, Ping and Yang, Yi and Zhou, Huiyang},
  year         = {2012},
  pages        = {995--1009},
}

@article{yang_xiang_kong_zhou_2010,
  title        = {An Optimizing Compiler for {GPGPU} Programs with Input-Data Sharing},
  volume       = {45},
  issn         = {1558-1160},
  doi          = {10.1145/1837853.1693505},
  abstractNote = {Developing high performance GPGPU programs is challenging for application developers since the performance is dependent upon how well the code leverages the hardware features of specific graphics processors. To solve this problem and relieve application developers of low-level hardware-specific optimizations, we introduce a novel compiler to optimize GPGPU programs. Our compiler takes a naive GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler then analyzes the code, identifies memory access patterns, and generates optimized code. The proposed compiler optimizations target at one category of scientific and media processing algorithms, which has the characteristics of input-data sharing when computing neighboring output pixels/elements. Many commonly used algorithms, such as matrix multiplication, convolution, etc., share such characteristics. For these algorithms, novel approaches are proposed to enforce memory coalescing and achieve effective data reuse. Data prefetching and hardware-specific tuning are also performed automatically with our compiler framework. The experimental results based on a set of applications show that our compiler achieves very high performance, either superior or very close to the highly fine-tuned library, NVIDIA CUBLAS 2.1.},
  number       = {5},
  journal      = {ACM SIGPLAN Notices},
  author       = {Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang},
  year         = {2010},
  month        = may,
  pages        = {343--344},
}

@inproceedings{yang_xiang_kong_zhou_2010_ppopp,
  title        = {An Optimizing Compiler for {GPGPU} Programs with Input-Data Sharing},
  isbn         = {978-1-60558-708-0},
  doi          = {10.1145/1693453.1693505},
  abstractNote = {Developing high performance GPGPU programs is challenging for application developers since the performance is dependent upon how well the code leverages the hardware features of specific graphics processors. To solve this problem and relieve application developers of low-level hardware-specific optimizations, we introduce a novel compiler to optimize GPGPU programs. Our compiler takes a naive GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler then analyzes the code, identifies memory access patterns, and generates optimized code. The proposed compiler optimizations target at one category of scientific and media processing algorithms, which has the characteristics of input-data sharing when computing neighboring output pixels/elements. Many commonly used algorithms, such as matrix multiplication, convolution, etc., share such characteristics. For these algorithms, novel approaches are proposed to enforce memory coalescing and achieve effective data reuse. Data prefetching and hardware-specific tuning are also performed automatically with our compiler framework. The experimental results based on a set of applications show that our compiler achieves very high performance, either superior or very close to the highly fine-tuned library, NVIDIA CUBLAS 2.1.},
  booktitle    = {PPoPP 2010: Proceedings of the 2010 {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming},
  author       = {Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang},
  year         = {2010},
  pages        = {343--344},
}