@article{zhong_li_zhou_wang_2018, title={Developing Noise-Resistant Three-Dimensional Single Particle Tracking Using Deep Neural Networks}, volume={90}, ISSN={["1520-6882"]}, DOI={10.1021/acs.analchem.8b01334}, abstractNote={Three-dimensional single particle tracking (3D SPT) is a powerful tool in various chemical and biological studies. In 3D SPT, z sensitive point spread functions (PSFs) are frequently used to generate different patterns, from which the axial position of the probe can be recovered in addition to its xy coordinates. Conventional linear classifier-based methods, for example, the correlation coefficient method, perform poorly when the signal-to-noise ratio (S/N) drops. In this work, we test deep neural networks (DNNs) in recognizing and differentiating very similar image patterns incurred in 3D SPT. The training of the deep neural networks is optimized, and a procedure is established for 3D localization. We show that for high S/N images, both DNNs and the conventional correlation coefficient-based method perform well. However, when the S/N drops close to 1, conventional methods completely fail while DNNs show strong resistance to both artificial and experimental noises. This noise resistance allows us to achieve a camera integration time of 50 μs for 200 nm fluorescent particles without losing accuracy significantly. This study sheds new light on developing robust image data analysis methods and on improving the time resolution of 3D SPT.}, number={18}, journal={ANALYTICAL CHEMISTRY}, author={Zhong, Yaning and Li, Chao and Zhou, Huiyang and Wang, Gufeng}, year={2018}, month={Sep}, pages={10748–10757} } @article{dai_li_zhou_gupta_kartsaklis_mantor_2016, title={A Model-Driven Approach to Warp/Thread-Block Level GPU Cache Bypassing}, ISSN={["0738-100X"]}, DOI={10.1145/2897937.2897966}, abstractNote={The high amount of memory requests from massive threads may easily cause cache contention and cache-miss-related resource congestion on GPUs. This paper proposes a simple yet effective performance model to estimate the impact of cache contention and resource congestion as a function of the number of warps/thread blocks (TBs) to bypass the cache. Then we design a hardware-based dynamic warp/thread-block level GPU cache bypassing scheme, which achieves 1.68x speedup on average on a set of memory-intensive benchmarks over the baseline. Compared to prior works, our scheme achieves 21.6% performance improvement over SWL-best [29] and 11.9% over CBWT-best [4] on average.}, journal={2016 ACM/EDAC/IEEE DESIGN AUTOMATION CONFERENCE (DAC)}, author={Dai, Hongwen and Li, Chao and Zhou, Huiyang and Gupta, Saurabh and Kartsaklis, Christos and Mantor, Mike}, year={2016} } @inproceedings{li_yang_feng_chakradhar_huiyang_2016, title={Optimizing memory efficiency for deep convolutional neural networks on GPUs}, DOI={10.1109/sc.2016.53}, abstractNote={Leveraging large data sets, deep Convolutional Neural Networks (CNNs) achieve state-of-the-art recognition accuracy. Due to the substantial compute and memory operations, however, they require significant execution time. The massive parallel computing capability of GPUs makes them one of the ideal platforms to accelerate CNNs, and a number of GPU-based CNN libraries have been developed. While existing works mainly focus on the computational efficiency of CNNs, the memory efficiency of CNNs has been largely overlooked. Yet CNNs have intricate data structures and their memory behavior can have a significant impact on performance.
In this work, we study the memory efficiency of various CNN layers and reveal the performance implications of both data layouts and memory access patterns. Experiments show the universal effect of our proposed optimizations on both single layers and various networks, with speedups of up to 27.9× for a single layer and up to 5.6× on the whole networks.}, booktitle={SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, author={Li, C. and Yang, Y. and Feng, M. and Chakradhar, S. and Zhou, H.}, year={2016}, pages={633–644} } @inproceedings{li_yang_lin_zhou_2015, title={Automatic data placement into GPU on-chip memory resources}, DOI={10.1109/cgo.2015.7054184}, abstractNote={Although graphics processing units (GPUs) rely on thread-level parallelism to hide long off-chip memory access latency, judicious utilization of on-chip memory resources, including register files, shared memory, and data caches, is critical to application performance. However, explicitly managing GPU on-chip memory resources is a non-trivial task for application developers. More importantly, as on-chip memory resources vary among different GPU generations, performance portability has become a daunting challenge. In this paper, we tackle this problem with compiler-driven automatic data placement. We focus on programs that have already been reasonably optimized either manually by programmers or automatically by compiler tools. Our proposed compiler algorithms refine these programs by revising data placement across different types of GPU on-chip resources to achieve both performance enhancement and performance portability. Among 12 benchmarks in our study, our proposed compiler algorithm improves the performance by 1.76× on average on Nvidia GTX480, and by 1.61× on average on GTX680.}, booktitle={2015 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)}, author={Li, C. and Yang, Y. and Lin, Zhen and Zhou, H. Y.}, year={2015}, pages={23–33} } @article{yang_li_zhou_2015, title={CUDA-NP: Realizing Nested Thread-Level Parallelism in GPGPU Applications}, volume={30}, ISSN={["1860-4749"]}, DOI={10.1007/s11390-015-1500-y}, abstractNote={Parallel programs consist of a series of code sections with different thread-level parallelism (TLP). As a result, it is rather common that a thread in a parallel program, such as a GPU kernel in CUDA programs, still contains both sequential code and parallel loops. In order to leverage such parallel loops, the latest NVIDIA Kepler architecture introduces dynamic parallelism, which allows a GPU thread to start another GPU kernel, thereby reducing the overhead of launching kernels from a CPU. However, with dynamic parallelism, a parent thread can only communicate with its child threads through global memory and the overhead of launching GPU kernels is non-trivial even within GPUs. In this paper, we first study a set of GPGPU benchmarks that contain parallel loops, and highlight that these benchmarks do not have a very high loop count or high degree of TLP. Consequently, the benefits of leveraging such parallel loops using dynamic parallelism are too limited to offset its overhead. We then present our proposed solution to exploit nested parallelism in CUDA, referred to as CUDA-NP. With CUDA-NP, we initially enable a high number of threads when a GPU program starts, and use control flow to activate different numbers of threads for different code sections.
We implement our proposed CUDA-NP framework using a directive-based compiler approach. For a GPU kernel, an application developer only needs to add OpenMP-like pragmas for parallelizable code sections. Then, our CUDA-NP compiler automatically generates the optimized GPU kernels. It supports both the reduction and the scan primitives, explores different ways to distribute parallel loop iterations into threads, and efficiently manages on-chip resources. Our experiments show that for a set of GPGPU benchmarks, which have already been optimized and contain nested parallelism, our proposed CUDA-NP framework further improves the performance by up to 6.69 times, and by 2.01 times on average.}, number={1}, journal={JOURNAL OF COMPUTER SCIENCE AND TECHNOLOGY}, author={Yang, Yi and Li, Chao and Zhou, Huiyang}, year={2015}, month={Jan}, pages={3–19} } @inproceedings{li_yang_dai_yan_mueller_zhou_2014, title={Understanding the tradeoffs between software-managed vs. hardware-managed caches in GPUs}, booktitle={IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)}, author={Li, C. and Yang, Y. and Dai, H. W. and Yan, S. G. and Mueller, F. and Zhou, H. Y.}, year={2014}, pages={231–241} } @inproceedings{yan_li_zhang_zhou_2014, place={New York}, title={yaSpMV: Yet Another SpMV Framework on GPUs}, volume={49}, ISSN={["1558-1160"]}, DOI={10.1145/2692916.2555255}, abstractNote={SpMV is a key linear algebra algorithm and has been widely used in many important application domains. As a result, numerous attempts have been made to optimize SpMV on GPUs to leverage their massive computational throughput. Although the previous work has shown impressive progress, load imbalance and high memory bandwidth requirements remain the critical performance bottlenecks for SpMV. In this paper, we present our novel solutions to these problems. First, we devise a new SpMV format, called blocked compressed common coordinate (BCCOO), which uses bit flags to store the row indices in a blocked common coordinate (COO) format so as to alleviate the bandwidth problem. We further improve this format by partitioning the matrix into vertical slices to enhance the cache hit rates when accessing the vector to be multiplied. Second, we revisit the segmented scan approach for SpMV to address the load imbalance problem. We propose a highly efficient matrix-based segmented sum/scan for SpMV and further improve it by eliminating global synchronization. Then, we introduce an auto-tuning framework to choose optimization parameters based on the characteristics of input sparse matrices and target hardware platforms. Our experimental results on GTX680 GPUs and GTX480 GPUs show that our proposed framework achieves significant performance improvement over the vendor-tuned CUSPARSE V5.0 (up to 229% and 65% on average on GTX680 GPUs, up to 150% and 42% on average on GTX480 GPUs) and some of the most recently proposed schemes (e.g., up to 195% and 70% on average over clSpMV on GTX680 GPUs, up to 162% and 40% on average over clSpMV on GTX480 GPUs).}, number={8}, booktitle={Proceedings of the 19th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, publisher={Association for Computing Machinery}, author={Yan, S. and Li, C. and Zhang, Y. and Zhou, H.}, year={2014}, month={Feb}, pages={107–118} }