@article{long_gong_zhang_zhou_2023, title={An Intelligent Framework for Oversubscription Management in CPU-GPU Unified Memory}, volume={21}, ISSN={["1572-9184"]}, DOI={10.1007/s10723-023-09646-1}, abstractNote={Unified virtual memory (UVM) improves GPU programmability by enabling on-demand data movement between CPU memory and GPU memory. However, due to the limited capacity of GPU device memory, oversubscription overhead becomes a major performance bottleneck for data-intensive workloads running on GPUs with UVM. This paper proposes a novel framework for UVM oversubscription management in discrete CPU-GPU systems. It consists of an access pattern classifier followed by a pattern-specific transformer-based model using a novel loss function aiming to reduce page thrashing. A policy engine is designed to leverage the model’s result to perform accurate page prefetching and eviction. Our evaluation shows that our proposed framework significantly outperforms the state-of-the-art (SOTA) methods on a set of 11 memory-intensive benchmarks, reducing the number of pages thrashed by 64.4% under 125% memory oversubscription compared to the baseline, while the SOTA method reduces the number of pages thrashed by 17.3%. Compared to the SOTA method, our solution achieves average IPC improvement of 1.52X and 3.66X under 125% and 150% memory oversubscription.}, number={1}, journal={JOURNAL OF GRID COMPUTING}, author={Long, Xinjian and Gong, Xiangyang and Zhang, Bo and Zhou, Huiyang}, year={2023}, month={Mar} } @article{long_gong_zhang_zhou_2023a, title={Deep learning based data prefetching in CPU-GPU unified virtual memory}, volume={174}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2022.12.004}, abstractNote={Unified Virtual Memory (UVM) relieves the developers from the onus of maintaining complex data structures and explicit data migration by enabling on-demand data movement between CPU memory and GPU memory. 
However, on-demand paging soon becomes a performance bottleneck of UVM due to the high latency caused by page table walks and data migration over interconnect. Prefetching is considered a promising solution to this problem given its ability to leverage the locality of program memory access patterns. However, existing locality-based prefetching schemes can not handle all the situations. An ideal prefetcher should not only look at narrow regions of the requested address space but also capture global context to deliver a good prediction of the memory access pattern. This paper proposes a novel framework for page prefetching for UVM through deep learning. We first show that a powerful Transformer learning model can provide high accuracy for UVM page prefetching. We then perform analysis to interpret this Transformer model and derive several insights that allow us to design a simpler model to match the unconstrained model's accuracy with orders of magnitude lower cost. We use a pattern-based method to make the UVM page predictor general to different GPU workloads. We evaluate this framework on a set of 11 memory-intensive benchmarks from popular benchmark suites. Our solution outperforms the state-of-the-art (SOTA) UVM framework, improving the performance by 10.89%, improving the device memory page hit rate by 16.98% (89.02% vs. 76.10% for prior art), and reducing the CPU-GPU interconnect traffic by 11.05%. According to our proposed unified metric, which combines the accuracy, coverage, and page hit rate, our solution is approaching the ideal prefetching scheme more than the SOTA design (0.90 vs. 
0.85, with the perfect prefetcher of 1.0).}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Long, Xinjian and Gong, Xiangyang and Zhang, Bo and Zhou, Huiyang}, year={2023}, month={Apr}, pages={19–31} } @article{li_liu_patil_hovland_zhou_2023, title={Enhancing Virtual Distillation with Circuit Cutting for Quantum Error Mitigation}, ISSN={["1063-6404"]}, DOI={10.1109/ICCD58817.2023.00024}, abstractNote={Virtual distillation is a technique that aims to mitigate errors in noisy quantum computers. It works by preparing multiple copies of a noisy quantum state, bridging them through a circuit, and conducting measurements. As the number of copies increases, this process allows for the estimation of the expectation value with respect to a state that approaches the ideal pure state rapidly. However, virtual distillation faces a challenge in realistic scenarios: preparing multiple copies of a quantum state and bridging them through a circuit in a noisy quantum computer will significantly increase the circuit size and introduce excessive noise, which will degrade the performance of virtual distillation. To overcome this challenge, we propose an error mitigation strategy that uses circuit-cutting technology to cut the entire circuit into fragments. With this approach, the fragments responsible for generating the noisy quantum state can be executed on a noisy quantum device, while the remaining fragments are efficiently simulated on a noiseless classical simulator. By running each fragment circuit separately on quantum and classical devices and recombining their results, we can reduce the noise accumulation and enhance the effectiveness of the virtual distillation technique. Our strategy has good scalability in terms of both runtime and computational resources. 
We demonstrate our strategy’s effectiveness through noisy simulation and experiments on a real quantum device.}, journal={2023 IEEE 41ST INTERNATIONAL CONFERENCE ON COMPUTER DESIGN, ICCD}, author={Li, Peiyi and Liu, Ji and Patil, Hrushikesh Pramod and Hovland, Paul and Zhou, Huiyang}, year={2023}, pages={94–101} } @article{tozlu_zhou_2023, title={PBVR: Physically Based Rendering in Virtual Reality}, DOI={10.1109/IISWC59245.2023.00039}, abstractNote={Virtual Reality (VR) is a rapidly growing domain that requires high-fidelity graphics for immersion. To understand and improve the VR architecture, an open-source, end-to-end platform for VR research was recently proposed. However, studying the stereo rendering aspect of VR applications remains challenging due to the lack of infrastructure.In this work, we augment the aforementioned open-source platform, ILLIXR, by integrating a high-end, physically based 3D rendering engine, Filament. This upgrade, named PBVR, enables developers to render high-quality graphics in a completely open-source VR platform.As a case study, we leverage our proposed PBVR to look into gaze-tracked foveated rendering and profile three different scenes. We show that a handful of renderpasses consume the most time and that readily available foveated rendering solutions, such as Variable Rate Shading, might not provide significant advantages. Moreover, our results reveal that eye tracking can incur a significant overhead on the graphics processing unit (GPU).}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON WORKLOAD CHARACTERIZATION, IISWC}, author={Tozlu, Yavuz Selim and Zhou, Huiyang}, year={2023}, pages={77–86} } @article{abdullah_zhou_awad_2023, title={Plutus: Bandwidth-Efficient Memory Security for GPUs}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA56546.2023.10071100}, abstractNote={Graphic-Processing Units (GPUs) are increasingly used in systems where security is a critical design requirement. 
Such systems include cloud computing, safety-critical systems, and edge devices, where sensitive data is processed or/and generated. Thus, the ability to reduce the attack surface while achieving high performance is of utmost importance. However, adding security features to GPUs comes at the expense of high-performance overheads due to the extra memory bandwidth required to handle security metadata. In particular, memory authentication metadata (e.g., authentication tags) along with encryption counters can lead to significant performance overheads due to the memory bandwidth used to fetch the metadata. Such metadata can lead to more than 200% extra bandwidth usage for irregular access patterns.In this work, we propose a novel design, Plutus, which enables low-overhead secure GPU memory. Plutus has three key ideas. The first is to leverage value locality to reduce authentication metadata. Our observation is that a large percentage of memory accesses could be verified without the need to bring the authentication tags. Specifically, through comparing decrypted blocks against known/verified values, we can with high confidence guarantee that no tampering occurred. Our analysis shows that the probability of the decryption of a tampered (and/or replayed) block leading to a known value is extremely low, in fact, lower than the collision probability in the most secure hash functions. Second, based on the observation that many GPU workloads have limited numbers of dirty block evictions, Plutus proposes a second layer of compact counters to reduce the memory traffic due to both the encryption counters and integrity tree. Third, by exploring the interesting tradeoff between the integrity tree organization vs. metadata fetch granularity, Plutus uses smaller block sizes for security metadata caches to optimize the number of security metadata memory requests. 
Based on our evaluation, Plutus can improve the GPU throughput by 16.86% (up to 58.38%) and reduce the memory bandwidth usage of secure memory by 48.14% (up to 80.30%).}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Abdullah, Rahaf and Zhou, Huiyang and Awad, Amro}, year={2023}, pages={543–555} } @article{freij_zhou_solihin_2023, title={SecPB: Architectures for Secure Non-Volatile Memory with Battery-Backed Persist Buffers}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA56546.2023.10071082}, abstractNote={The durability of data stored in persistent memory (PM) exposes data to potentially data leakage attacks. Recent research has identified the requirements for crash recoverable secure PM, but do not consider recent trends of the persistency domain extending on-chip to include cache hierarchies. In this paper, we explore this design space and identify performance and energy optimization opportunities.We propose secure persistent buffers (SecPB), a battery-backed persistent structure that moves the point of secure data persistency from the memory controller closer to the core. We revisit the fundamentals of how data in PM is secured and show how various subsets of security metadata can be generated lazily while still guaranteeing crash recoverability and integrity verification. 
We analyze the metadata dependency chain required in securing PM and expose optimization opportunities that allow for SecPB to reduce performance overheads by up to 32.8×, with average performance overheads as low as 1.3% observed for reasonable battery capacities.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Freij, Alexander and Zhou, Huiyang and Solihin, Yan}, year={2023}, pages={677–690} } @article{zhao_gao_nie_zhou_2022, title={A Survey of GPU Multitasking Methods Supported by Hardware Architecture}, volume={33}, ISSN={["1558-2183"]}, url={https://doi.org/10.1109/TPDS.2021.3115630}, DOI={10.1109/TPDS.2021.3115630}, abstractNote={The ability to support multitasking becomes more and more important in the development of graphic processing unit (GPU). GPU multitasking methods are classified into three types: temporal multitasking, spatial multitasking, and simultaneous multitasking (SMK). This article first introduces the features of some commercial GPU architectures to support multitasking and the common metrics used for evaluating the performance of GPU multitasking methods, and then reviews the GPU multitasking methods supported by hardware architecture (i.e., hardware GPU multitasking methods). The main problems of each type of hardware GPU multitasking methods to be solved are illustrated. Meanwhile, the key idea of each previous hardware GPU multitasking method is introduced. In addition, the characteristics of hardware GPU multitasking methods belonging to the same type are compared. This article also gives some valuable suggestions for the future research. An enhanced GPU simulator is needed to bridge the gap between academia and industry. In addition, it is promising to expand the research space with machine learning technologies, advanced GPU architectural innovations, 3D stacked memory, etc. 
Because most previous GPU multitasking methods are based on NVIDIA GPUs, this article focuses on NVIDIA GPU architecture, and uses NVIDIA's terminology. To our knowledge, this article is the first survey about hardware GPU multitasking methods. We believe that our survey can help the readers gain insights into the research field of hardware GPU multitasking methods.}, number={6}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Zhao, Chen and Gao, Wu and Nie, Feiping and Zhou, Huiyang}, year={2022}, month={Jun}, pages={1451–1463} } @article{yuan_awad_yudha_solihin_zhou_2022, title={Adaptive Security Support for Heterogeneous Memory on GPUs}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA53966.2022.00024}, abstractNote={The wide use of accelerators such as GPUs necessitates their security support. Recent works [17], [33], [34] pointed out that directly adopting the CPU secure memory design to GPUs could incur significant performance overheads due to the memory bandwidth contention between regular data and security metadata. In this paper, we analyze the security guarantees that are used to defend against physical attacks, and make the observation that heterogeneous GPU memory system may not always need all the security mechanisms to achieve the security guarantees. Based on the memory types as well as memory access patterns either explicitly specified in the GPU programming model or implicitly detected at run time, we propose adaptive security memory support for heterogeneous memory on GPUs. Specifically, we first identify the read-only data and propose to only use MAC (Message Authentication Code) to protect their integrity. By eliminating the freshness checks on read-only data, we can use an on-chip shared counter for such data regions and remove the corresponding parts in the Bonsai Merkle Tree (BMT), thereby reducing the traffic due to encryption counters and the BMT. 
Second, we detect the common streaming data access pattern and propose coarse-grain MACs for such stream data to reduce the MAC access bandwidth. With the hardware-based detection of memory type (read-only or not) and memory access patterns (streaming or not), our proposed approach adapts the security support to significantly reduce the performance overhead without sacrificing the security guarantees. Our evaluation shows that our scheme can achieve secure memory on GPUs with low overheads for memory-intensive workloads. Among the fifteen memory-intensive workloads in our evaluation, our design reduces the performance overheads of secure GPU memory from 53.9% to 8.09% on average. Compared to the state-of-the-art secure memory designs for GPU [17], [33], our scheme outperforms PSSM by up to 41.63% and 9.5% on average and outperforms Common counters by 84.04% on average for memory-intensive workloads. We further propose to use the L2 cache as a victim cache for security metadata when the L2 is either underutilized or suffers from very high miss rates, which further reduces the overheads by up to 4% and 0.65% on average.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Yuan, Shougang and Awad, Amro and Yudha, Ardhi Wiratama Baskara and Solihin, Yan and Zhou, Huiyang}, year={2022}, pages={213–228} } @inproceedings{li_liu_li_zhou_2022, place={New Jersey}, title={Exploiting Quantum Assertions for Error Mitigation and Quantum Program Debugging}, ISSN={["1063-6404"]}, DOI={10.1109/ICCD56317.2022.00028}, abstractNote={An assertion is a predicate that should be evaluated true during program execution. In this paper, we present the development of quantum assertion schemes and show how they are used for hardware error mitigation and software debugging. Compared to assertions in classical programs, quantum assertions are challenging due to the no-cloning theorem and potentially destructive measurement. 
We discuss how these challenges can be circumvented such that certain properties of quantum states can be verified non-destructively during program execution. Furthermore, we show that besides detecting program bugs, dynamic assertion circuits can mitigate noise effects via post-selection of the assertion results. Our case studies demonstrate the use of quantum assertions in various quantum algorithms.}, booktitle={2022 IEEE 40TH INTERNATIONAL CONFERENCE ON COMPUTER DESIGN (ICCD 2022)}, publisher={IEEE}, author={Li, Peiyi and Liu, Ji and Li, Yangjia and Zhou, Huiyang}, year={2022}, pages={124–131} } @article{yudha_meyer_yuan_zhou_solihin_2022, title={LITE: A Low-Cost Practical Inter-Operable GPU TEE}, DOI={10.1145/3524059.3532361}, abstractNote={There is a strong need for GPU trusted execution environments (TEEs) as GPU is increasingly used in the cloud environment. However, current proposals either ignore memory security (i.e., not encrypting memory) or impose a separate memory encryption domain from the host TEE, causing a very substantial slowdown for communicating data from/to the host. In this paper, we propose a flexible GPU memory encryption design called LITE that relies on software memory encryption aided by small architecture support. LITE's flexibility allows GPU TEE to be co-designed with CPU to create a unified encryption domain. We show that GPU applications can be adapted to the use of LITE encryption APIs without major changes. 
Through various optimizations, we show that software memory encryption in LITE can produce negligible performance overheads (1.1%) for regular benchmarks and still-acceptable overheads (56%) for irregular benchmarks.}, journal={PROCEEDINGS OF THE 36TH ACM INTERNATIONAL CONFERENCE ON SUPERCOMPUTING, ICS 2022}, author={Yudha, Ardhi Wiratama Baskara and Meyer, Jake and Yuan, Shougang and Zhou, Huiyang and Solihin, Yan}, year={2022} } @article{liu_li_zhou_2022, title={Not All SWAPs Have the Same Cost: A Case for Optimization-Aware Qubit Routing}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA53966.2022.00058}, abstractNote={Despite rapid advances in quantum computing technologies, the qubit connectivity limitation remains to be a critical challenge. Both near-term NISQ quantum computers and relatively long-term scalable quantum architectures do not offer full connectivity. As a result, quantum circuits may not be directly executed on quantum hardware, and a quantum compiler needs to perform qubit routing to make the circuit compatible with the device layout. During the qubit routing step, the compiler inserts SWAP gates and performs circuit transformations. Given the connectivity topology of the target hardware, there are typically multiple qubit routing candidates. The state-of-the-art compilers use a cost function to evaluate the number of SWAP gates for different routes and then select the one with the minimum number of SWAP gates. After qubit routing, the quantum compiler performs gate optimizations upon the circuit with the newly inserted SWAP gates.In this paper, we observe that the aforementioned qubit routing is not optimal, and qubit routing should not be independent on subsequent gate optimizations. We find that with the consideration of gate optimizations, not all of the SWAP gates have the same basis-gate cost. These insights lead to the development of our qubit routing algorithm, NASSC (Not All Swaps have the Same Cost). 
NASSC is the first algorithm that considers the subsequent optimizations during the routing step. Our optimization-aware qubit routing leads to better routing decisions and benefits subsequent optimizations. We also propose a new optimization-aware decomposition for the inserted SWAP gates. Our experiments show that the routing overhead compiled with our routing algorithm is reduced by up to 69.30% (21.30% on average) in the number of CNOT gates and up to 43.50% (7.61% on average) in the circuit depth compared with the state-of-the-art scheme, SABRE.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Liu, Ji and Li, Peiyi and Zhou, Huiyang}, year={2022}, pages={709–725} } @article{yuan_yudha_solihin_zhou_2021, title={Analyzing Secure Memory Architecture for GPUs}, DOI={10.1109/ISPASS51385.2021.00017}, abstractNote={Wide adoption of cloud computing makes privacy and security a primary concern. Although recent CPUs have integrated secure memory architecture, such support is still missing for GPUs, a key accelerator in data centers. In this paper, we explore two secure memory architectures, counter-mode encryption and direct encryption, for GPUs and show that we need to architect secure memory differently from it for CPUs. Our in-depth study reveals the following insights. First, as GPUs are designed for high-throughput computation, its secure memory needs to deliver high bandwidth. Second, with counter-mode encryption, the memory traffic resulting from the metadata, i.e., the counters, MACs (message-authentication codes), and integrity tree, may cause significant performance degradation, even in the presence of metadata caches. Third, the sectored cache structure adopted by GPUs leads to multiple sequential accesses to the same metadata cache line, which necessitates the use of MSHRs (miss-status handling registers) for meta-data caches. 
Fourth, unlike CPUs, separate/partitioned metadata caches perform better than unified metadata caches on GPUs. The reason is that GPU workloads feature streaming accesses, which cause severe contention in the unified metadata cache and the cached counters and integrity tree nodes may be evicted before being reused. Fifth, the massive-threaded nature of GPUs make them latency-tolerant and the performance impact due to the extra encryption/decryption latency is limited. As a result, direct encryption can be a promising alternative for GPU secure memory. The challenge, however, lies in memory integrity verification as the integrity tree may incur high storage overhead and metadata traffic.}, journal={2021 IEEE INTERNATIONAL SYMPOSIUM ON PERFORMANCE ANALYSIS OF SYSTEMS AND SOFTWARE (ISPASS 2021)}, author={Yuan, Shougang and Yudha, Ardhi Wiratama Baskara and Solihin, Yan and Zhou, Huiyang}, year={2021}, pages={59–69} } @article{ravi_nguyen_zhou_becchi_2021, title={PILOT: a Runtime System to Manage Multi-tenant GPU Unified Memory Footprint}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC53243.2021.00063}, abstractNote={Concurrent kernel execution on GPU has proven an effective technique to improve system throughput by maximizing the resource utilization. In order to increase programmability and meet the increasing memory requirements of data-intensive applications, current GPUs support Unified Virtual Memory (UVM), which provides a virtual memory abstraction with demand paging. By allowing applications to oversubscribe GPU memory, UVM provides increased opportunities to share GPU resources across applications. However, in the presence of applications with competing memory requirements, GPU sharing can lead to performance degradation due to thrashing. 
NVIDIA's Multiple Process Service (MPS) offers the capability to space share bare metal GPUs, thereby enabling cluster workload managers, such as Slurm, to share a single GPU across MPI ranks with limited control over resource partitioning. However, it is not possible to preempt, schedule, or throttle a running GPU process through MPS. These features would enable new OS-managed scheduling policies to be implemented for GPU kernels to dynamically handle resource contention and offer consistent performance. The contribution of this paper is two-fold. We first show how memory oversubscription can impact the performance of concurrent GPU applications. Then, we propose three methods to transparently mitigate memory interference through kernel preemption and scheduling policies. To implement our policies, we develop our own runtime system (PILOT) to serve as an alternative to NVIDIA's MPS. In the presence of memory over-subscription, we noticed a dramatic improvement in the overall throughput when using our scheduling policies and runtime hints.}, journal={2021 IEEE 28TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING, DATA, AND ANALYTICS (HIPC 2021)}, author={Ravi, John and Nguyen, Tri and Zhou, Huiyang and Becchi, Michela}, year={2021}, pages={442–447} } @article{liu_bello_zhou_2021, title={Relaxed Peephole Optimization: A Novel Compiler Optimization for Quantum Circuits}, ISSN={["2164-2397"]}, DOI={10.1109/CGO51591.2021.9370310}, abstractNote={As in classical computing, compilers play an important role in quantum computing. Quantum processors typically support a limited set of primitive operations or quantum gates and have certain hardware-related limitations. A quantum compiler is responsible for adapting a quantum program to these constraint environments and decomposing quantum gates into a sequence of the primitive ones. 
During the compilation process, it is also critical for the compiler to optimize the quantum circuits in order to reduce the noise in the computation results. Since the noise is introduced by operations and decoherence, reducing the gate count is the key for improving performance. In this paper, we propose a novel quantum compiler optimization, named relaxed peephole optimization (RPO) for quantum computers. RPO leverages the single-qubit state information that can be determined statically by the compiler. We define that a qubit is in a basis state when, at a given point in time, its state is either in the X-, Y-, or Z-basis (|+〉 / |-〉, |L〉 / |R〉 and |0〉 / |1〉). When basis qubits are used as inputs to quantum gates, there exist opportunities for strength reduction, which replaces quantum operations with equivalent but less expensive ones. Compared to the existing peephole optimization for quantum programs, the difference is that our proposed optimization does not require an identical unitary matrix, thereby named ‘relaxed’ peephole optimization. We also extend our approach to optimize the quantum gates when some input qubits are in known pure states. Both optimizations, namely the Quantum Basis-state Optimization (QBO) and the Quantum Pure-state Optimization (QPO), are implemented in IBM's Qiskit transpiler. Our experimental results show that our proposed optimization pass is fast and effective. The circuits optimized with our compiler optimizations obtain up to 18.0% (11.7% on average) fewer CNOT gates and up to 8.2% (7.1% on average) lower transpilation time than that of the most aggressive optimization level in the Qiskit compiler. 
When running on real quantum computers, the success rates of 3-qubit quantum phase estimation algorithm improve by 2.30X due to the reduced gate counts.}, journal={CGO '21: PROCEEDINGS OF THE 2021 IEEE/ACM INTERNATIONAL SYMPOSIUM ON CODE GENERATION AND OPTIMIZATION (CGO)}, author={Liu, Ji and Bello, Luciano and Zhou, Huiyang}, year={2021}, pages={301–314} } @article{liu_zhou_2021, title={Systematic Approaches for Precise and Approximate Quantum State Runtime Assertion}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA51647.2021.00025}, abstractNote={With the rapid growth of quantum computing technology, programmers need new tools for debugging quantum programs. Recent works show that assertions are a promising way for debugging quantum programs. However, there are two main drawbacks with the existing schemes. First, the existing schemes, including both statistical and dynamic assertions are only capable of asserting limited types of states, namely classical, superposition, and specific entanglement states. Second, the use cases of these assertions are limited, since the programmer has to know the exact/precise state to assert.In this work, we propose two systematic approaches for dynamic quantum state assertion and they can assert a much broader range of quantum states including both pure states and mixed states. We also introduce the idea of approximate quantum state assertion for the cases where the programmers only have limited knowledge of the quantum states. Approximate assertion is capable of checking membership in a set of states $\{|\psi\rangle,\ |\phi\rangle,\ldots\}$. While precise quantum state assertion can check a specific quantum state, approximate assertion enables a way to check whether the qubits of interest are in a super-set of some expected states, which is analogous to the well-known Bloom filter for membership checking in classical computing. 
Our experiments demonstrate that our systematic approaches can assert many more quantum states and can be used in various assertion locations for qubit state checking.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Liu, Ji and Zhou, Huiyang}, year={2021}, pages={179–193} } @article{mao_zhou_gui_shen_2020, title={Exploring Convolution Neural Network for Branch Prediction}, volume={8}, url={https://doi.org/10.1109/ACCESS.2020.3017196}, DOI={10.1109/ACCESS.2020.3017196}, abstractNote={Recently, there have been significant advances in deep neural networks (DNNs) and they have shown distinctive performance in speech recognition, natural language processing, and image recognition. In this paper, we explore DNNs to push the limit for branch prediction. We treat branch prediction as a classification problem and employ both deep convolutional neural networks (CNNs), ranging from LeNet to ResNet-50, and deep belief network (DBN) for branch prediction. We compare the effectiveness of DNNs with the state-of-the-art branch predictors, including the perceptron, our prior work, Multi-poTAGE+SC, and MTAGE+SC branch predictors. The last two are the most recent winners of championship branch prediction (CBP) contests. Several interesting observations emerged from our study. First, for branch prediction, the DNNs outperform the perceptron model as high as 60–80%. Second, we analyze the impact of the depth of CNNs (i.e., number of convolutional layers and pooling layers) on the misprediction rates. The results confirm that deeper CNN structures can lead to lower misprediction rates. 
Third, the DBN could outperform our prior work, but not outperform the state-of-the-art TAGE-like branch predictor; the ResNet-50 could not only outperform our prior work, but also the Multi-poTAGE+SC and MTAGE+SC.}, journal={IEEE Access}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Mao, Yonghua and Zhou, Huiyang and Gui, Xiaolin and Shen, Junjie}, year={2020}, pages={152008–152016} } @article{zhao_gao_nie_wang_zhou_2020, title={Fair and cache blocking aware warp scheduling for concurrent kernel execution on GPU}, volume={112}, ISSN={["1872-7115"]}, DOI={10.1016/j.future.2020.05.023}, abstractNote={With Graphic Processing Units (GPUs) being widely adopted in data centers to provide computing power, efficient support for GPU multitasking attracts significant attention. The prior GPU multitasking works include spatial multitasking and simultaneous multitasking (SMK). Spatial multitasking allocates GPU resources at the streaming multiprocessor (SM) granularity which is coarse-grained, and SMK runs concurrent kernels on the same SM, therefore is fine-grained. SMK is beneficial to improve GPU resource utilization especially when concurrent kernels have complementary characteristics. However, the main challenge for SMK is the interference among multiple kernels especially the contention on data cache. In this paper, we propose a fair and cache blocking aware warp scheduling (FCBWS) approach to ameliorate the contention on data cache and improve SMK on GPUs. In FCBWS, equal opportunity of issuing instructions is provided to each kernel, and memory pipeline stalls are tried to be avoided by predicting cache blocking. Kernels are extracted from various applications to construct concurrent kernel execution benchmarks. 
The simulation experiment results show that FCBWS outperforms previous multitasking methods; even compared to the state-of-the-art SMK method, FCBWS can improve system throughput (STP) by 10% on average and reduce average normalized turnaround time (ANTT) by 41% on average.}, journal={FUTURE GENERATION COMPUTER SYSTEMS-THE INTERNATIONAL JOURNAL OF ESCIENCE}, author={Zhao, Chen and Gao, Wu and Nie, Feiping and Wang, Fei and Zhou, Huiyang}, year={2020}, month={Nov}, pages={1093–1105} } @inproceedings{liu_byrd_zhou_2020, place={New York, NY, USA}, title={Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85082381669&partnerID=MN8TOARS}, DOI={10.1145/3373376.3378488}, abstractNote={In this paper, we propose quantum circuits for runtime assertions, which can be used for both software debugging and error detection. Runtime assertion is challenging in quantum computing for two key reasons. First, a quantum bit (qubit) cannot be copied, which is known as the non-cloning theorem. Second, when a qubit is measured, its superposition state collapses into a classical state, losing the inherent parallel information. In this paper, we overcome these challenges with runtime computation through ancilla qubits, which are used to indirectly collect the information of the qubits of interest. We design quantum circuits to assert classical states, entanglement, and superposition states. Our experimental results show that they are effective in debugging as well as improving the success rate for various quantum algorithms on IBM Q quantum computers.}, booktitle={ASPLOS '20: Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems}, publisher={Association for Computing Machinery}, author={Liu, J. and Byrd, G. 
and Zhou, H.}, year={2020}, pages={1017–1030} } @article{liu_zhou_2020, title={Reliability Modeling of NISQ-Era Quantum Computers}, DOI={10.1109/IISWC50251.2020.00018}, abstractNote={Recent developments in quantum computers have been pushing up the number of qubits. However, the state-of-the-art Noisy Intermediate Scale Quantum (NISQ) computers still do not have enough qubits to accommodate the error correction circuit. Noise in quantum gates limits the reliability of quantum circuits. To characterize the noise effects, prior methods such as process tomography, gateset tomography and randomized benchmarking have been proposed. However, the challenge is that these methods do not scale well with the number of qubits. Noise models based on the understanding of underneath physics have also been proposed to study different kinds of noise in quantum computers. The difficulty is that there is no widely accepted noise model that incorporates all different kinds of errors. The realworld errors can be very complicated and it remains an active area of research to produce accurate noise models. In this paper, instead of using noise models to estimate the reliability, which is measured with success rates or inference strength, we treat the NISQ quantum computer as a black box. We use several quantum circuit characteristics such as the number of qubits, circuit depth, the number of CNOT gates, and the connection topology of the quantum computer as inputs to the black box and derive a reliability estimation model using (1) polynomial fitting and (2) a shallow neural network. We propose randomized benchmarks with random numbers of qubits and basic gates to generate a large data set for neural network training. We show that the estimated reliability from our black-box model outperforms the noise models from Qiskit. 
We also showcase that our black-box model can be used to guide quantum circuit optimization at compile time.}, journal={2020 IEEE INTERNATIONAL SYMPOSIUM ON WORKLOAD CHARACTERIZATION (IISWC 2020)}, author={Liu, Ji and Zhou, Huiyang}, year={2020}, pages={94–105} } @article{yudha_kimura_zhou_solihin_2020, title={Scalable and Fast Lazy Persistency on GPUs}, DOI={10.1109/IISWC50251.2020.00032}, abstractNote={GPUs applications, including many scientific and machine learning applications, increasingly demand larger memory capacity. NVM is promising higher density compared to DRAM and better future scaling potentials. Long running GPU applications can benefit from NVM by exploiting its persistency, allowing crash recovery of data in memory. In this paper, we propose mapping Lazy Persistency (LP) to GPUs and identify the design space of such mapping. We then characterize LP performance on GPUs, varying the checksum type, reduction method, use of locking, and hash table designs. Armed with insights into the performance bottlenecks, we propose a hash table-less method that performs well on hundreds and thousands of threads, achieving persistency with nearly negligible (2.1%) slowdown for a variety of representative benchmarks. We also propose a directive-based programming language support to simplify programming effort for adding LP to GPU applications.}, journal={2020 IEEE INTERNATIONAL SYMPOSIUM ON WORKLOAD CHARACTERIZATION (IISWC 2020)}, author={Yudha, Ardhi Wiratama Baskara and Kimura, Keiji and Zhou, Huiyang and Solihin, Yan}, year={2020}, pages={252–263} } @article{lin_dai_mantor_zhou_2019, title={Coordinated CTA Combination and Bandwidth Partitioning for GPU Concurrent Kernel Execution}, volume={16}, ISSN={["1544-3973"]}, DOI={10.1145/3326124}, abstractNote={Contemporary GPUs support multiple kernels to run concurrently on the same streaming multiprocessors (SMs). 
Recent studies have demonstrated that such concurrent kernel execution (CKE) improves both resource utilization and computational throughput. Most of the prior works focus on partitioning the GPU resources at the cooperative thread array (CTA) level or the warp scheduler level to improve CKE. However, significant performance slowdown and unfairness are observed when latency-sensitive kernels co-run with bandwidth-intensive ones. The reason is that bandwidth over-subscription from bandwidth-intensive kernels leads to much aggravated memory access latency, which is highly detrimental to latency-sensitive kernels. Even among bandwidth-intensive kernels, more intensive kernels may unfairly consume much higher bandwidth than less-intensive ones.}, number={3}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Lin, Zhen and Dai, Hongwen and Mantor, Michael and Zhou, Huiyang}, year={2019}, month={Aug} } @inproceedings{lin_alshboul_solihin_zhou_2019, title={Exploring Memory Persistency Models for GPUs}, ISSN={["1089-795X"]}, DOI={10.1109/PACT.2019.00032}, abstractNote={Given its high integration density, high speed, byte addressability, and low standby power, non-volatile or persistent memory is expected to supplement/replace DRAM as main memory. Through persistency programming model (which defines durability ordering of stores) and durable transaction constructs, the programmer can provide recoverable data structure (RDS) which allows programs to recover to a consistent state after a failure. While persistency models have been well studied for CPUs, they have been neglected for graphics processing units (GPUs). Considering the importance of GPUs as a dominant accelerator for high performance computing, we investigate persistency models for GPUs. GPU applications exhibit substantial differences with CPUs applications, hence in this paper we adapt, re-architect, and optimize CPU persistency models for GPUs. 
We design a pragma-based compiler scheme for expressing persistency model for GPUs. We identify that the thread hierarchy in GPUs offers intuitive scopes to form epochs and durable transactions. We find that undo logging produces significant performance overheads. We propose to use idempotency analysis to reduce both logging frequency and the size of logs. Through both real-system and simulation evaluations, we show low overheads of our proposed architecture support.}, booktitle={28th International Conference on Parallel Architectures and Compilation Techniques (PACT)}, author={Lin, Zhen and Alshboul, Mohammad and Solihin, Yan and Zhou, Huiyang}, year={2019}, pages={310–322} } @inproceedings{guan_ning_lin_shen_zhou_lim_2019, place={San Mateo, CA}, title={In-Place Zero-Space Memory Protection for CNN}, volume={32}, booktitle={Advances in Neural Information Processing Systems}, publisher={Morgan Kaufmann Publishers}, author={Guan, H. and Ning, L. and Lin, Z. and Shen, X. and Zhou, H. and Lim, S.}, editor={Wallach, H. and Larochelle, H. and Beygelzimer, A. and d'Alché-Buc, F. and Fox, E. and Garnett, R.}, year={2019} } @article{zhou_byrd_2019, title={Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation}, volume={18}, ISSN={1556-6056 1556-6064 2473-2575}, url={http://dx.doi.org/10.1109/LCA.2019.2935049}, DOI={10.1109/LCA.2019.2935049}, abstractNote={In this paper, we propose quantum circuits for runtime assertions, which can be used for both software debugging and error detection. Runtime assertion is challenging in quantum computing for two key reasons. First, a quantum bit (qubit) cannot be copied, which is known as the non-cloning theorem. Second, when a qubit is measured, its superposition state collapses into a classical state, losing the inherent parallel information. In this paper, we overcome these challenges with runtime computation through ancilla qubits, which are used to indirectly collect the information of the qubits of interest.
We design quantum circuits to assert classical states, entanglement, and superposition states.}, number={2}, journal={IEEE Computer Architecture Letters}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Zhou, Huiyang and Byrd, Gregory T.}, year={2019}, month={Jul}, pages={111–114} } @article{liu_byrd_zhou_2019, title={Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation}, url={https://doi.org/10.36227/techrxiv.11319929}, DOI={10.36227/techrxiv.11319929}, abstractNote={In this paper, we propose quantum circuits to enable dynamic assertions for classical values, entanglement, and superposition. This enables a dynamic debugging primitive, driven by a programmer’s understanding of the correct behavior of the quantum program. We show that besides generating assertion errors, the assertion logic may also force the qubits under test to be into the desired state. Besides debugging, our proposed assertion logic can also be used in noisy intermediate scale quantum (NISQ) systems to filter out erroneous results, as demonstrated on a 20-qubit IBM Q quantum computer. Our proposed assertion circuits have been implemented as functions in the open-source Qiskit tool.}, author={Liu, Ji and Byrd, Greg and Zhou, Huiyang}, year={2019}, month={Dec} } @article{liu_byrd_zhou_2019a, title={Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation}, url={https://doi.org/10.36227/techrxiv.11319929.v1}, DOI={10.36227/techrxiv.11319929.v1}, abstractNote={In this paper, we propose quantum circuits to enable dynamic assertions for classical values, entanglement, and superposition. This enables a dynamic debugging primitive, driven by a programmer’s understanding of the correct behavior of the quantum program. We show that besides generating assertion errors, the assertion logic may also force the qubits under test to be into the desired state.
Besides debugging, our proposed assertion logic can also be used in noisy intermediate scale quantum (NISQ) systems to filter out erroneous results, as demonstrated on a 20-qubit IBM Q quantum computer. Our proposed assertion circuits have been implemented as functions in the open-source Qiskit tool.}, author={Liu, Ji and Byrd, Greg and Zhou, Huiyang}, year={2019}, month={Dec} } @article{lin_mathur_zhou_2019, title={Scatter-and-Gather Revisited: High-Performance Side-Channel-Resistant AES on GPUs}, DOI={10.1145/3300053.3319415}, abstractNote={Recent works have shown that there exist microarchitectural timing channels in contemporary GPUs, which make table-based cryptographic algorithms like AES vulnerable to side channel timing attacks. Also, table-based cryptographic algorithms have been known to be vulnerable to prime-and-probe attacks due to their key-dependent footprint in the data cache. Such analysis casts serious concerns on the feasibility of accelerating table-based cryptographic algorithms on GPUs. In this paper, we revisit the scatter-and-gather (SG) approach and make a case for using this approach to implement table-based cryptographic algorithms on GPUs to achieve both high performance and strong resistance to side channel attacks. Our results show that our SG-based AES achieves both high performance and strong resistance against all the known side channel attacks on these different generations of NVIDIA GPUs. 
We also reveal unexpected findings on a new timing channel in the L1 data cache (D-cache) on NVIDIA Maxwell and Pascal GPUs.}, journal={12TH WORKSHOP ON GENERAL PURPOSE PROCESSING USING GPUS (GPGPU 12)}, author={Lin, Zhen and Mathur, Utkarsh and Zhou, Huiyang}, year={2019}, pages={2–11} } @inproceedings{dai_lin_li_zhao_wang_zheng_zhou_2018, title={Accelerate GPU Concurrent Kernel Execution by Mitigating Memory Pipeline Stalls}, url={http://dx.doi.org/10.1109/hpca.2018.00027}, DOI={10.1109/HPCA.2018.00027}, abstractNote={Following the advances in technology scaling, graphics processing units (GPUs) incorporate an increasing amount of computing resources and it becomes difficult for a single GPU kernel to fully utilize the vast GPU resources. One solution to improve resource utilization is concurrent kernel execution (CKE). Early CKE mainly targets the leftover resources. However, it fails to optimize the resource utilization and does not provide fairness among concurrent kernels. Spatial multitasking assigns a subset of streaming multiprocessors (SMs) to each kernel. Although achieving better fairness, the resource underutilization within an SM is not addressed. Thus, intra-SM sharing has been proposed to issue thread blocks from different kernels to each SM. However, as shown in this study, the overall performance may be undermined in the intra-SM sharing schemes due to the severe interference among kernels. Specifically, as concurrent kernels share the memory subsystem, one kernel, even as computing-intensive, may starve from not being able to issue memory instructions in time. Besides, severe L1 D-cache thrashing and memory pipeline stalls caused by one kernel, especially a memory-intensive one, will impact other kernels, further hurting the overall performance. In this study, we investigate various approaches to overcome the aforementioned problems exposed in intra-SM sharing. 
We first highlight that cache partitioning techniques proposed for CPUs are not effective for GPUs. Then we propose two approaches to reduce memory pipeline stalls. The first is to balance memory accesses of concurrent kernels. The second is to limit the number of inflight memory instructions issued from individual kernels. Our evaluation shows that the proposed schemes significantly improve the weighted speedup of two state-of-the-art intra-SM sharing schemes, Warped-Slicer and SMK, by 24.6% and 27.2% on average, respectively, with lightweight hardware overhead.}, booktitle={2018 IEEE International Symposium on High Performance Computer Architecture (HPCA)}, author={Dai, H. and Lin, Z. and Li, C. and Zhao, C. and Wang, F. and Zheng, N. and Zhou, H.}, year={2018}, month={Feb} } @article{zhong_li_zhou_wang_2018, title={Developing Noise-Resistant Three-Dimensional Single Particle Tracking Using Deep Neural Networks}, volume={90}, ISSN={["1520-6882"]}, DOI={10.1021/acs.analchem.8b01334}, abstractNote={Three-dimensional single particle tracking (3D SPT) is a powerful tool in various chemical and biological studies. In 3D SPT, z sensitive point spread functions (PSFs) are frequently used to generate different patterns, from which the axial position of the probe can be recovered in addition to its xy coordinates. Conventional linear classifier-based methods, for example, the correlation coefficient method, perform poorly when the signal-to-noise ratio (S/N) drops. In this work, we test deep neural networks (DNNs) in recognizing and differentiating very similar image patterns incurred in 3D SPT. The training of the deep neural networks is optimized, and a procedure is established for 3D localization. We show that for high S/N images, both DNNs and conventional correlation coefficient-based method perform well. However, when the S/N drops close to 1, conventional methods completely fail while DNNs show strong resistance to both artificial and experimental noises. 
This noise resistance allows us to achieve a camera integration time of 50 μs for 200 nm fluorescent particles without losing accuracy significantly. This study sheds new light on developing robust image data analysis methods and on improving the time resolution of 3D SPT.}, number={18}, journal={ANALYTICAL CHEMISTRY}, author={Zhong, Yaning and Li, Chao and Zhou, Huiyang and Wang, Gufeng}, year={2018}, month={Sep}, pages={10748–10757} } @article{lin_mantor_zhou_2018, title={GPU Performance vs. Thread-Level Parallelism: Scalability Analysis and a Novel Way to Improve TLP}, volume={15}, ISSN={["1544-3973"]}, DOI={10.1145/3177964}, abstractNote={Graphics Processing Units (GPUs) leverage massive thread-level parallelism (TLP) to achieve high computation throughput and hide long memory latency. However, recent studies have shown that the GPU performance does not scale with the GPU occupancy or the degrees of TLP that a GPU supports, especially for memory-intensive workloads. The current understanding points to L1 D-cache contention or off-chip memory bandwidth. In this article, we perform a novel scalability analysis from the perspective of throughput utilization of various GPU components, including off-chip DRAM, multiple levels of caches, and the interconnect between L1 D-caches and L2 partitions. We show that the interconnect bandwidth is a critical bound for GPU performance scalability.}, number={1}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Lin, Zhen and Mantor, Michael and Zhou, Huiyang}, year={2018}, month={Apr} } @article{verma_zhou_booth_king_coole_keep_marshall_feng_2017, title={Developing Dynamic Profiling and Debugging Support in OpenCL for FPGAs}, ISSN={["0738-100X"]}, DOI={10.1145/3061639.3062230}, abstractNote={With FPGAs emerging as a promising accelerator for general-purpose computing, there is a strong demand to make them accessible to software developers. 
Recent advances in OpenCL compilers for FPGAs pave the way for synthesizing FPGA hardware from OpenCL kernel code. To enable broader adoption of this paradigm, significant challenges remain. This paper presents our efforts in developing dynamic profiling and debugging support in OpenCL for FPGAs. We first propose primitive code patterns, including a timestamp and an event-ordering function, and then develop a framework, which can be plugged easily into OpenCL kernels, to dynamically collect and process run-time information.}, journal={PROCEEDINGS OF THE 2017 54TH ACM/EDAC/IEEE DESIGN AUTOMATION CONFERENCE (DAC)}, author={Verma, Anshuman and Zhou, Huiyang and Booth, Skip and King, Robbie and Coole, James and Keep, Andy and Marshall, John and Feng, Wu-chun}, year={2017} } @inproceedings{chen_zhao_shen_zhou_2017, title={EffiSha: A Software Framework for Enabling Efficient Preemptive Scheduling of GPU}, DOI={10.1145/3018743.3018748}, abstractNote={Modern GPUs are broadly adopted in many multitasking environments, including data centers and smartphones. However, the current support for the scheduling of multiple GPU kernels (from different applications) is limited, forming a major barrier for GPU to meet many practical needs. This work for the first time demonstrates that on existing GPUs, efficient preemptive scheduling of GPU kernels is possible even without special hardware support. Specifically, it presents EffiSha, a pure software framework that enables preemptive scheduling of GPU kernels with very low overhead. The enabled preemptive scheduler offers flexible support of kernels of different priorities, and demonstrates significant potential for reducing the average turnaround time and improving the system overall throughput of programs that time share a modern GPU.}, booktitle={PPoPP '17: Proceedings of the 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, author={Chen, G. and Zhao, Y. and Shen, X. 
and Zhou, H.}, year={2017}, pages={3–16} } @techreport{mao_zhou_gui_2017, place={Raleigh, NC}, title={Exploring deep neural networks for branch prediction}, url={https://people.engr.ncsu.edu/hzhou/CNN_DBN_zhou_2017.pdf}, institution={Electrical and Computer Engineering Department, N.C. State University}, author={Mao, Y. and Zhou, H. and Gui, X.}, year={2017}, month={Sep} } @article{cramer_pohlmann_gomez_mark_kornegay_hall_siraliev-perez_walavalkar_sperlazza_bilinovich_et_al_2017, title={Methylation specific targeting of a chromatin remodeling complex from sponges to humans}, volume={7}, ISSN={["2045-2322"]}, DOI={10.1038/srep40674}, abstractNote={Abstract}, journal={SCIENTIFIC REPORTS}, author={Cramer, Jason M. and Pohlmann, Deborah and Gomez, Fernando and Mark, Leslie and Kornegay, Benjamin and Hall, Chelsea and Siraliev-Perez, Edhriz and Walavalkar, Ninad M. and Sperlazza, M. Jeannette and Bilinovich, Stephanie and others}, year={2017}, month={Jan} } @inproceedings{dai_li_lin_zhou_2017, title={The Demand for a Sound Baseline in GPU Memory Architecture Research}, url={https://people.engr.ncsu.edu/hzhou/Hongwen_WDDD2017.pdf}, booktitle={14th Annual Workshop on Duplicating, Deconstructing and Debunking (WDDD)}, author={Dai, H. and Li, C. and Lin, Z. and Zhou, H.}, year={2017} } @article{zhang_li_yan_zhou_2016, title={A Cross-Platform SpMV Framework on Many-Core Architectures}, volume={13}, ISSN={1544-3566}, url={http://dx.doi.org/10.1145/2994148}, DOI={10.1145/2994148}, abstractNote={Sparse Matrix-Vector multiplication (SpMV) is a key operation in engineering and scientific computing. Although the previous work has shown impressive progress in optimizing SpMV on many-core architectures, load imbalance and high memory bandwidth remain the critical performance bottlenecks. We present our novel solutions to these problems, for both GPUs and Intel MIC many-core architectures. First, we devise a new SpMV format, called Blocked Compressed Common Coordinate (BCCOO).
BCCOO extends the blocked Common Coordinate (COO) by using bit flags to store the row indices to alleviate the bandwidth problem. We further improve this format by partitioning the matrix into vertical slices for better data locality. Then, to address the load imbalance problem, we propose a highly efficient matrix-based segmented sum/scan algorithm for SpMV, which eliminates global synchronization. At last, we introduce an autotuning framework to choose optimization parameters. Experimental results show that our proposed framework has a significant advantage over the existing SpMV libraries. In single precision, our proposed scheme outperforms clSpMV COCKTAIL format by 255% on average on AMD FirePro W8000, and outperforms CUSPARSE V7.0 by 73.7% on average and outperforms CSR5 by 53.6% on average on GeForce Titan X; in double precision, our proposed scheme outperforms CUSPARSE V7.0 by 34.0% on average and outperforms CSR5 by 16.2% on average on Tesla K20, and has equivalent performance compared with CSR5 on Intel MIC.}, number={4}, journal={ACM Transactions on Architecture and Code Optimization}, publisher={Association for Computing Machinery (ACM)}, author={Zhang, Yunquan and Li, Shigang and Yan, Shengen and Zhou, Huiyang}, year={2016}, month={Oct}, pages={1–25} } @article{dai_li_zhou_gupta_kartsaklis_mantor_2016, title={A Model-Driven Approach to Warp/Thread-Block Level GPU Cache Bypassing}, ISSN={["0738-100X"]}, DOI={10.1145/2897937.2897966}, abstractNote={The high amount of memory requests from massive threads may easily cause cache contention and cache-miss-related resource congestion on GPUs. This paper proposes a simple yet effective performance model to estimate the impact of cache contention and resource congestion as a function of the number of warps/thread blocks (TBs) to bypass the cache. 
Then we design a hardware-based dynamic warp/thread-block level GPU cache bypassing scheme, which achieves 1.68x speedup on average on a set of memory-intensive benchmarks over the baseline. Compared to prior works, our scheme achieves 21.6% performance improvement over SWL-best [29] and 11.9% over CBWT-best [4] on average.}, journal={2016 ACM/EDAC/IEEE DESIGN AUTOMATION CONFERENCE (DAC)}, author={Dai, Hongwen and Li, Chao and Zhou, Huiyang and Gupta, Saurabh and Kartsaklis, Christos and Mantor, Mike}, year={2016} } @inproceedings{lin_nyland_huiyang_2016, title={Enabling efficient preemption for SIMT architectures with lightweight context switching}, DOI={10.1109/sc.2016.76}, abstractNote={Context switching is a key technique enabling preemption and time-multiplexing for CPUs. However, for single-instruction multiple-thread (SIMT) processors such as high-end graphics processing units (GPUs), it is challenging to support context switching due to the massive number of threads, which leads to a huge amount of architectural states to be swapped during context switching. The architectural state of SIMT processors includes registers, shared memory, SIMT stacks and barrier states. Recent works present thread-block-level preemption on SIMT processors to avoid context switching overhead. However, because the execution time of a thread block (TB) is highly dependent on the kernel program. The response time of preemption cannot be guaranteed and some TB-level preemption techniques cannot be applied to all kernel functions. In this paper, we propose three complementary ways to reduce and compress the architectural states to achieve lightweight context switching on SIMT processors. Experiments show that our approaches can reduce the register context size by 91.5% on average. Based on lightweight context switching, we enable instruction-level preemption on SIMT processors with compiler and hardware co-design. 
With our proposed schemes, the preemption latency is reduced by 59.7% on average compared to the naive approach.}, booktitle={SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, author={Lin, Zhen and Nyland, L. and Zhou, Huiyang}, year={2016}, pages={898–908} } @inproceedings{chen_huiyang_shen_gahm_venkat_booth_marshall_2016, title={{OpenCL}-based erasure coding on heterogeneous architectures}, volume={7}, DOI={10.1109/asap.2016.7760770}, abstractNote={Erasure coding, Reed-Solomon coding in particular, is a key technique to deal with failures in scale-out storage systems. However, due to the algorithmic complexity, the performance overhead of erasure coding can become a significant bottleneck in storage systems attempting to meet service level agreements (SLAs). Previous work has mainly leveraged SIMD (single-instruction multiple-data) instruction extensions in general purpose processors to improve the processing throughput. In this work, we exploit state-of-art heterogeneous architectures, including GPUs, APUs, and FPGAs, to accelerate erasure coding. We leverage the OpenCL framework for our target heterogeneous architectures and propose code optimizations for each target architecture. Given their different hardware characteristics, we highlight the different optimization strategies for each of the target architectures. Using the throughput metric as the ratio of the input file size over the processing latency, we achieve 2.84 GB/s on a 28-core Xeon CPU, 3.90 GB/s on an NVIDIA K40m GPU, 0.56 GB/s on an AMD Carrizo APU, and 1.19 GB/s (5.35 GB/s if only considering the kernel execution latency) on an Altera Stratix V FPGA, when processing a 836.9MB zipped file with a 30×33 encoding matrix.
In comparison, the single-thread code using the Intel's ISA-L library running on the Xeon CPU has the throughput of 0.13 GB/s.}, booktitle={IEEE International Conference on Application-Specific Systems, Architectures and Processors (ASAP)}, publisher={IEEE}, author={Chen, G. Y. and Zhou, Huiyang and Shen, Xipeng and Gahm, J. and Venkat, N. and Booth, S. and Marshall, J.}, year={2016}, pages={33–40} } @inproceedings{li_yang_feng_chakradhar_huiyang_2016, title={Optimizing memory efficiency for deep convolutional neural networks on GPUs}, DOI={10.1109/sc.2016.53}, abstractNote={Leveraging large data sets, deep Convolutional Neural Networks (CNNs) achieve state-of-the-art recognition accuracy. Due to the substantial compute and memory operations, however, they require significant execution time. The massive parallel computing capability of GPUs make them as one of the ideal platforms to accelerate CNNs and a number of GPU-based CNN libraries have been developed. While existing works mainly focus on the computational efficiency of CNNs, the memory efficiency of CNNs have been largely overlooked. Yet CNNs have intricate data structures and their memory behavior can have significant impact on the performance. In this work, we study the memory efficiency of various CNN layers and reveal the performance implication from both data layouts and memory access patterns. Experiments show the universal effect of our proposed optimizations on both single layers and various networks, with up to 27.9× for a single layer and up to 5.6× on the whole networks.}, booktitle={SC '16: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, author={Li, C. and Yang, Y. and Feng, M. and Chakradhar, S.
and Zhou, Huiyang}, year={2016}, pages={633–644} } @inproceedings{zhao_wang_lin_zhou_zheng_2016, place={Los Alamitos, CA}, title={Selective GPU Cache Bypassing for Un-Coalesced Loads}, DOI={10.1109/ICPADS.2016.0122}, abstractNote={GPUs are widely used to accelerate general purpose applications, and could hide memory latency through massive multithreading. But multithreading can increase contention for the L1 data caches (L1D). This problem is exacerbated when an application contains irregular memory references which would lead to un-coalesced memory accesses. In this paper, we propose a simple yet effective GPU cache Bypassing scheme for Un-Coalesced Loads (BUCL). BUCL makes bypassing decisions at two granularities. At the instruction-level, when the number of memory accesses generated by a non-coalesced load instruction is bigger than a threshold, referred as the threshold of un-coalescing degree (TUCD), all the accesses generated from this load will bypass L1D. The reason is that the cache data filled by un-coalesced loads typically have low probabilities to be reused. At the level of each individual memory access, when the L1D is stalled, the accessed data is likely with low locality, and the utilization of the target memory sub-partition is not high, this memory access may also bypass L1D. Our experiments show that BUCL achieves 36% and 5% performance improvement over the baseline GPU for memory un-coalesced and memory coherent benchmarks, respectively, and also significantly outperforms prior GPU cache bypassing and warp throttling schemes.}, booktitle={22nd IEEE International Conference on Parallel and Distributed Systems : ICPADS 2016 : proceedings : 13-16 December 2016, Wuhan, Hubei, China}, publisher={IEEE Computer Society}, author={Zhao, C. and Wang, F. and Lin, Z. and Zhou, H.
and Zheng, N.}, editor={Liao, Xiaofei}, year={2016} } @inproceedings{jia_huiyang_2016, title={Tuning stencil codes in {OpenCL} for {FPGAs}}, DOI={10.1109/iccd.2016.7753287}, abstractNote={OpenCL is designed as a parallel programming framework to support heterogeneous computing platforms. The implicit or explicit parallelism in OpenCL kernel code enables efficient FPGA implementation from a high-level programming abstraction. However, FPGA architecture is completely different from GPU architecture, for which OpenCL is widely used. Tuning OpenCL codes to achieve high performance on FPGAs is an open problem and the existing OpenCL tools and optimizations proposed for CPUs/GPUs may not be directly applicable to FPGAs. In this paper, we explore OpenCL code optimizations for stencil computations on FPGAs. We propose tuning processes for stencil kernels in both the Single-Task and NDRange modes. Our optimized 1D convolution, 2D convolution and 2D Jacobi iteration kernels can achieve up to two orders of magnitude performance improvement over the naïve kernels. Also, compared to Altera design examples our optimized kernels achieve 7.1× and 3.5× speedups for the Sobel and Time-Domain FIR Filter, respectively. This study also includes benchmarking of the FPGA memory system, revealing how code patterns affect the performance of different types of memory on FPGAs.}, booktitle={Proceedings of the 34th IEEE International Conference on Computer Design (ICCD)}, author={Jia, Q. and Zhou, Huiyang}, year={2016}, pages={249–256} } @inproceedings{jia_padia_amboju_zhou_2015, title={An Optimized AMPM-based Prefetcher Coupled with Configurable Cache Line Sizing}, booktitle={JILP Workshop on Computer Architecture Competitions (JWAC): 2nd Data Prefetching Championship (DPC2)}, author={Jia, Q. and Padia, M. B. and Amboju, K.
and Zhou, H.}, year={2015}, month=jun }
@inproceedings{mayank_dai_wei_huiyang_2015,
  title        = {Analyzing Graphics Processor Unit ({GPU}) Instruction Set Architectures},
  DOI          = {10.1109/ispass.2015.7095794},
  abstractNote = {Because of their high throughput and power efficiency, massively parallel architectures like graphics processing units (GPUs) become a popular platform for general purpose computing. However, there are few studies and analyses on GPU instruction set architectures (ISAs) although it is well-known that the ISA is a fundamental design issue of all modern processors including GPUs.},
  booktitle    = {IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  author       = {Mayank, K. and Dai, H. W. and Wei, J. Z. and Zhou, Huiyang},
  year         = {2015},
  pages        = {155--156}
}
@inproceedings{li_yang_lin_huiyang_2015,
  title        = {Automatic Data Placement into {GPU} On-chip Memory Resources},
  DOI          = {10.1109/cgo.2015.7054184},
  abstractNote = {Although graphics processing units (GPUs) rely on thread-level parallelism to hide long off-chip memory access latency, judicious utilization of on-chip memory resources, including register files, shared memory, and data caches, is critical to application performance. However, explicitly managing GPU on-chip memory resources is a non-trivial task for application developers. More importantly, as on-chip memory resources vary among different GPU generations, performance portability has become a daunting challenge. In this paper, we tackle this problem with compiler-driven automatic data placement. We focus on programs that have already been reasonably optimized either manually by programmers or automatically by compiler tools. Our proposed compiler algorithms refine these programs by revising data placement across different types of GPU on-chip resources to achieve both performance enhancement and performance portability. 
Among 12 benchmarks in our study, our proposed compiler algorithm improves the performance by 1.76× on average on Nvidia GTX480, and by 1.61× on average on GTX680.}, booktitle={2015 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)}, author={Li, C. and Yang, Y. and Lin, Zhen and Huiyang}, year={2015}, pages={23–33} } @article{yang_li_zhou_2015, title={CUDA-NP: Realizing Nested Thread-Level Parallelism in GPGPU Applications}, volume={30}, ISSN={["1860-4749"]}, DOI={10.1007/s11390-015-1500-y}, abstractNote={Parallel programs consist of series of code sections with different thread-level parallelism (TLP). As a result, it is rather common that a thread in a parallel program, such as a GPU kernel in CUDA programs, still contains both sequential code and parallel loops. In order to leverage such parallel loops, the latest NVIDIA Kepler architecture introduces dynamic parallelism, which allows a GPU thread to start another GPU kernel, thereby reducing the overhead of launching kernels from a CPU. However, with dynamic parallelism, a parent thread can only communicate with its child threads through global memory and the overhead of launching GPU kernels is non-trivial even within GPUs. In this paper, we first study a set of GPGPU benchmarks that contain parallel loops, and highlight that these benchmarks do not have a very high loop count or high degree of TLP. Consequently, the benefits of leveraging such parallel loops using dynamic parallelism are too limited to offset its overhead. We then present our proposed solution to exploit nested parallelism in CUDA, referred to as CUDA-NP. With CUDA-NP, we initially enable a high number of threads when a GPU program starts, and use control flow to activate different numbers of threads for different code sections. We implement our proposed CUDA-NP framework using a directive-based compiler approach. 
For a GPU kernel, an application developer only needs to add OpenMP-like pragmas for parallelizable code sections. Then, our CUDA-NP compiler automatically generates the optimized GPU kernels. It supports both the reduction and the scan primitives, explores different ways to distribute parallel loop iterations into threads, and efficiently manages on-chip resource. Our experiments show that for a set of GPGPU benchmarks, which have already been optimized and contain nested parallelism, our proposed CUDA-NP framework further improves the performance by up to 6.69 times and 2.01 times on average.}, number={1}, journal={JOURNAL OF COMPUTER SCIENCE AND TECHNOLOGY}, author={Yang, Yi and Li, Chao and Zhou, Huiyang}, year={2015}, month={Jan}, pages={3–19} } @inproceedings{li_song_dai_sidelnik_hari_zhou_2015, place={New York}, title={Locality-Driven Dynamic GPU Cache Bypassing}, DOI={10.1145/2751205.2751237}, abstractNote={This paper presents novel cache optimizations for massively parallel, throughput-oriented architectures like GPUs. L1 data caches (L1 D-caches) are critical resources for providing high-bandwidth and low-latency data accesses. However, the high number of simultaneous requests from single-instruction multiple-thread (SIMT) cores makes the limited capacity of L1 D-caches a performance and energy bottleneck, especially for memory-intensive applications. We observe that the memory access streams to L1 D-caches for many applications contain a significant amount of requests with low reuse, which greatly reduce the cache efficacy. Existing GPU cache management schemes are either based on conditional/reactive solutions or hit-rate based designs specifically developed for CPU last level caches, which can limit overall performance. 
To overcome these challenges, we propose an efficient locality monitoring mechanism to dynamically filter the access stream on cache insertion such that only the data with high reuse and short reuse distances are stored in the L1 D-cache. Specifically, we present a design that integrates locality filtering based on reuse characteristics of GPU workloads into the decoupled tag store of the existing L1 D-cache through simple and cost-effective hardware extensions. Results show that our proposed design can dramatically reduce cache contention and achieve up to 56.8% and an average of 30.3% performance improvement over the baseline architecture, for a range of highly-optimized cache-unfriendly applications with minor area overhead and better energy efficiency. Our design also significantly outperforms the state-of-the-art CPU and GPU bypassing schemes (especially for irregular applications), without generating extra L2 and DRAM level contention.}, booktitle={ICS '15: Proceedings of the 29th ACM on International Conference on Supercomputing}, publisher={ACM}, author={Li, C. and Song, S. and Dai, H. and Sidelnik, A. and Hari, S. and Zhou, H.}, year={2015}, month={Jun}, pages={61–77} } @inproceedings{xiang_yang_mantor_rubin_zhou_2015, place={New Jersey}, title={Revisiting ILP Designs for Throughput-Oriented GPGPU Architecture}, ISBN={9781479980062}, DOI={10.1109/CCGrid.2015.14}, abstractNote={Many-core architectures such as graphics processing units (GPUs) rely on thread-level parallelism (TLP)to overcome pipeline hazards. Consequently, each core in a many-core processor employs a relatively simple in-order pipeline with limited capability to exploit instruction-level parallelism (ILP). In this paper, we study the ILP impact on the throughput-oriented many-core architecture, including data bypassing, score boarding and branch prediction. We show that these ILP techniques significantly reduce the performance dependency on TLP. 
This is especially useful for applications, whose resource usage limits the hardware to run a high number of threads concurrently. Furthermore, ILP techniques reduce the demand on on-chip resource to support high TLP. Given the workload-dependent impact from ILP, we propose heterogeneous GPGPU architecture, consisting of both the cores designed for high TLP and those customized with ILPtechniques. Our results show that our heterogeneous GPUarchitecture achieves high throughput as well as high energy and area-efficiency compared to homogenous designs.}, booktitle={Proceedings of the 2015 15th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing}, publisher={IEEE}, author={Xiang, P. and Yang, Y. and Mantor, M. and Rubin, N. and Zhou, H.}, year={2015}, month={May}, pages={121–130} } @article{gupta_zhou_2015, title={Spatial Locality-Aware Cache Partitioning for Effective Cache Sharing}, ISSN={["0190-3918"]}, DOI={10.1109/icpp.2015.24}, abstractNote={In modern multi-core processors, last-level caches (LLCs) are typically shared among multiple cores. Previous works have shown that such sharing is beneficial as different workloads have different needs for cache capacity, and logical partitioning of capacity can improve system performance. However, what is missing in previous works on partitioning shared LLCs is that the heterogeneity in spatial locality among workloads has not been explored. In other words, all the cores use the same block/line size in shared LLCs. In this work, we highlight that exploiting spatial locality enables much more effective cache sharing. The fundamental reason is that for many memory intensive workloads, their cache capacity requirements can be drastically reduced when a large block size is employed, therefore they can effectively donate more capacity to other workloads. 
To leverage spatial locality for cache partitioning effectively, we first propose a simple yet effective mechanism to measure both spatial and temporal locality at run-time. The locality information is then used to determine both the proper block size and the capacity assigned to each workload. Our experiments show that our Spatial Locality-aware Cache Partitioning (SLCP) significantly outperforms the previous works. We also present several case studies that dissect the effectiveness of SLCP compared to the existing approaches.}, journal={2015 44TH INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING (ICPP)}, author={Gupta, Saurabh and Zhou, Huiyang}, year={2015}, pages={150–159} } @inproceedings{yang_xiang_mantor_rubin_hsu_dong_zhou_2014, place={New Jersey}, title={A Case for a Flexible Scalar Unit in SIMT Architecture}, ISBN={9781479938001 9781479937998}, DOI={10.1109/IPDPS.2014.21}, abstractNote={The wide availability and the Single-Instruction Multiple-Thread (SIMT)-style programming model have made graphics processing units (GPUs) a promising choice for high performance computing. However, because of the SIMT style processing, an instruction will be executed in every thread even if the operands are identical for all the threads. To overcome this inefficiency, the AMD's latest Graphics Core Next (GCN) architecture integrates a scalar unit into a SIMT unit. In GCN, both the SIMT unit and the scalar unit share a single SIMT style instruction stream. Depending on its type, an instruction is issued to either a scalar or a SIMT unit. In this paper, we propose to extend the scalar unit so that it can either share the instruction stream with the SIMT unit or execute a separate instruction stream. The program to be executed by the scalar unit is referred to as a scalar program and its purpose is to assist SIMT-unit execution. The scalar programs are either generated from SIMT programs automatically by the compiler or manually developed by expert developers. 
We make a case for our proposed flexible scalar unit through three collaborative execution paradigms: data prefetching, control divergence elimination, and scalar-workload extraction. Our experimental results show that significant performance gains can be achieved using our proposed approaches compared to the state-of-art SIMT style processing.}, booktitle={Proceedings of 2014 IEEE 28th International Parallel and Distributed Processing Symposium}, publisher={IEEE}, author={Yang, Y. and Xiang, P. and Mantor, M. and Rubin, N. and Hsu, L. and Dong, Q. and Zhou, H.}, year={2014}, month={May} } @inbook{yang_zhou_2014, title={A Highly Efficient FFT Using Shared-Memory Multiplexing}, ISBN={9783319065472 9783319065489}, url={http://dx.doi.org/10.1007/978-3-319-06548-9_17}, DOI={10.1007/978-3-319-06548-9_17}, booktitle={Numerical Computations with GPUs}, publisher={Springer International Publishing}, author={Yang, Yi and Zhou, Huiyang}, year={2014}, pages={363–377} } @article{yang_zhou_2014, title={CUDA-NP: Realizing Nested Thread-Level Parallelism in GPGPU Applications}, volume={49}, ISSN={["1558-1160"]}, DOI={10.1145/2692916.2555254}, abstractNote={Parallel programs consist of series of code sections with different thread-level parallelism (TLP). As a result, it is rather common that a thread in a parallel program, such as a GPU kernel in CUDA programs, still contains both se-quential code and parallel loops. In order to leverage such parallel loops, the latest Nvidia Kepler architecture intro-duces dynamic parallelism, which allows a GPU thread to start another GPU kernel, thereby reducing the overhead of launching kernels from a CPU. However, with dynamic parallelism, a parent thread can only communicate with its child threads through global memory and the overhead of launching GPU kernels is non-trivial even within GPUs. 
In this paper, we first study a set of GPGPU benchmarks that contain parallel loops, and highlight that these bench-marks do not have a very high loop count or high degrees of TLP. Consequently, the benefits of leveraging such par-allel loops using dynamic parallelism are too limited to offset its overhead. We then present our proposed solution to exploit nested parallelism in CUDA, referred to as CUDA-NP. With CUDA-NP, we initially enable a high number of threads when a GPU program starts, and use control flow to activate different numbers of threads for different code sections. We implemented our proposed CUDA-NP framework using a directive-based compiler approach. For a GPU kernel, an application developer only needs to add OpenMP-like pragmas for parallelizable code sections. Then, our CUDA-NP compiler automatically gen-erates the optimized GPU kernels. It supports both the reduction and the scan primitives, explores different ways to distribute parallel loop iterations into threads, and effi-ciently manages on-chip resource. Our experiments show that for a set of GPGPU benchmarks, which have already been optimized and contain nested parallelism, our pro-posed CUDA-NP framework further improves the perfor-mance by up to 6.69 times and 2.18 times on average.}, number={8}, journal={ACM SIGPLAN NOTICES}, author={Yang, Yi and Zhou, Huiyang}, year={2014}, month={Aug}, pages={93–105} } @inproceedings{li_yang_dai_yan_mueller_zhou_2014, title={Understanding the tradeoffs between software-managed vs. hardware-managed caches in GPUs}, booktitle={Ieee international symposium on performance analysis of systems and}, author={Li, C. and Yang, Y. and Dai, H. W. and Yan, S. G. and Mueller, F. and Zhou, H. 
Y.}, year={2014}, pages={231–241} } @inproceedings{xiang_yang_huiyang_2014, title={Warp-level divergence in GPUs: Characterization, impact, and mitigation}, DOI={10.1109/hpca.2014.6835939}, abstractNote={High throughput architectures rely on high thread-level parallelism (TLP) to hide execution latencies. In state-of-art graphics processing units (GPUs), threads are organized in a grid of thread blocks (TBs) and each TB contains tens to hundreds of threads. With a TB-level resource management scheme, all the resource required by a TB is allocated/released when it is dispatched to / finished in a streaming multiprocessor (SM). In this paper, we highlight that such TB-level resource management can severely affect the TLP that may be achieved in the hardware. First, different warps in a TB may finish at different times, which we refer to as `warp-level divergence'. Due to TB-level resource management, the resources allocated to early finished warps are essentially wasted as they need to wait for the longest running warp in the same TB to finish. Second, TB-level management can lead to resource fragmentation. For example, the maximum number of threads to run on an SM in an NVIDIA GTX 480 GPU is 1536. For an application with a TB containing 1024 threads, only 1 TB can run on the SM even though it has sufficient resource for a few hundreds more threads. To overcome these inefficiencies, we propose to allocate and release resources at the warp level. Warps are dispatched to an SM as long as it has sufficient resource for a warp rather than a TB. Furthermore, whenever a warp is completed, its resource is released and can accommodate a new warp. This way, we effectively increase the number of active warps without actually increasing the size of critical resources. We present our lightweight architectural support for our proposed warp-level resource management. 
The experimental results show that our approach achieves up to 76.0% and an average of 16.0% performance gains and up to 21.7% and an average of 6.7% energy savings at minor hardware overhead.}, booktitle={International Symposium on High-Performance Computer Architecture (HPCA)}, author={Xiang, P. and Yang, Y. and Zhou, Huiyang}, year={2014}, pages={284--295} }
@inproceedings{yan_li_zhang_zhou_2014,
  place        = {New York},
  title        = {{yaSpMV}: Yet Another {SpMV} Framework on {GPUs}},
  volume       = {49},
  ISSN         = {["1558-1160"]},
  DOI          = {10.1145/2692916.2555255},
  abstractNote = {SpMV is a key linear algebra algorithm and has been widely used in many important application domains. As a result, numerous attempts have been made to optimize SpMV on GPUs to leverage their massive computational throughput. Although the previous work has shown impressive progress, load imbalance and high memory bandwidth remain the critical performance bottlenecks for SpMV. In this paper, we present our novel solutions to these problems. First, we devise a new SpMV format, called blocked compressed common coordinate (BCCOO), which uses bit flags to store the row indices in a blocked common coordinate (COO) format so as to alleviate the bandwidth problem. We further improve this format by partitioning the matrix into vertical slices to enhance the cache hit rates when accessing the vector to be multiplied. Second, we revisit the segmented scan approach for SpMV to address the load imbalance problem. We propose a highly efficient matrix-based segmented sum/scan for SpMV and further improve it by eliminating global synchronization. Then, we introduce an auto-tuning framework to choose optimization parameters based on the characteristics of input sparse matrices and target hardware platforms. 
Our experimental results on GTX680 GPUs and GTX480 GPUs show that our proposed framework achieves significant performance improvement over the vendor tuned CUSPARSE V5.0 (up to 229% and 65% on average on GTX680 GPUs, up to 150% and 42% on average on GTX480 GPUs) and some most recently proposed schemes (e.g., up to 195% and 70% on average over clSpMV on GTX680 GPUs, up to 162% and 40% on average over clSpMV on GTX480 GPUs).}, number={8}, booktitle={Proceedings of the 19th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, publisher={Association for Computing Machinery}, author={Yan, S. and Li, C. and Zhang, Y. and Zhou, H.}, year={2014}, month={Feb}, pages={107–118} } @article{gupta_gao_zhou_2013, title={Adaptive Cache Bypassing for Inclusive Last Level Caches}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2013.16}, abstractNote={Cache hierarchy designs, including bypassing, replacement, and the inclusion property, have significant performance impact. Recent works on high performance caches have shown that cache bypassing is an effective technique to enhance the last level cache (LLC) performance. However, commonly used inclusive cache hierarchy cannot benefit from this technique because bypassing inherently breaks the inclusion property. This paper presents a solution to enabling cache bypassing for inclusive caches. We introduce a bypass buffer to an LLC. Bypassed cache lines skip the LLC while their tags are stored in this bypass buffer. When a tag is evicted from the bypass buffer, it invalidates the corresponding cache lines in upper level caches to ensure the inclusion property. Our key insight is that the lifetime of a bypassed line, assuming a well-designed bypassing algorithm, should be short in upper level caches and is most likely dead when its tag is evicted from the bypass buffer. Therefore, a small bypass buffer is sufficient to maintain the inclusion property and to reap most performance benefits of bypassing. 
Furthermore, the bypass buffer facilitates bypassing algorithms by providing the usage information of bypassed lines. We show that a top performing cache bypassing algorithm, which is originally designed for non-inclusive caches, performs comparably for inclusive caches equipped with our bypass buffer. The usage information collected from the bypass buffer also significantly reduces the cost of hardware implementation compared to the original design.}, journal={IEEE 27TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2013)}, author={Gupta, Saurabh and Gao, Hongliang and Zhou, Huiyang}, year={2013}, pages={1243–1253} } @article{gupta_xiang_zhou_2013, place={New York}, title={Analyzing locality of memory references in GPU architectures}, volume={6}, DOI={10.1145/2492408.2492423}, abstractNote={In this paper we advocate formal locality analysis on memory references of GPGPU kernels. We investigate the locality of reference at different cache levels in the memory hierarchy. At the L1 cache level, we look into the locality behavior at the warp-, the thread block- and the streaming multiprocessor-level. Using matrix multiplication as a case study, we show that our locality analysis accurately captures some interesting and counter-intuitive behavior of the memory accesses. We believe that such analysis will provide very useful insights in understanding the memory accessing behavior and optimizing the memory hierarchy in GPU architectures.}, journal={MSPC '13: Proceedings of the ACM SIGPLAN Workshop on Memory Systems Performance and Correctness}, publisher={Association for Computing Machinery}, author={Gupta, S. and Xiang, P. 
and Zhou, H.}, year={2013}, month={Jun} } @article{kong_aciicmez_seifert_zhou_2013, title={Architecting against Software Cache-Based Side-Channel Attacks}, volume={62}, ISSN={["0018-9340"]}, DOI={10.1109/tc.2012.78}, abstractNote={Using cache-like architectural components including data caches, instruction caches, or branch target buffers as a side channel, software cache-based side-channel attacks are able to derive secret keys used in cryptographic operations through legitimate software activities. Existing software solutions are typically application specific and incur substantial performance overhead. Recent hardware proposals against attacks on data caches, although effective in reducing performance overhead, may still be vulnerable to advanced attacks. Furthermore, efficient defenses against attacks on other cache structures, including instruction caches and branch target buffers, are missing. In this paper, we propose hardware-software integrated approaches to defend against software cache-based attacks comprehensively. For attacks on data caches, we propose to use preloading, informing loads, and informing loads with software random permutation to secure the partition-locked cache (PLcache), the random permutation (RPcache) and regular caches, respectively. These approaches present different tradeoffs between hardware complexity and performance overhead. To defend against attacks on instruction caches, we show that the PLcache with preloading and the RPcache provide good protection. To defend against attacks based on branch target buffers, we propose to adopt a new update policy to eliminate potential information leaking. 
Our experiments show that the proposed schemes not only provide strong security protection but also incur small performance overhead.}, number={7}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Kong, Jingfei and Aciicmez, Onur and Seifert, Jean-Pierre and Zhou, Huiyang}, year={2013}, month={Jul}, pages={1276–1288} } @inproceedings{xiang_yang_mantor_rubin_hsu_zhou_2013, place={New York}, title={Exploiting Uniform Vector Instructions for GPGPU Performance, Energy Efficiency, and Opportunistic Reliability Enhancement}, DOI={10.1145/2464996.2465022}, abstractNote={State-of-art graphics processing units (GPUs) employ the single-instruction multiple-data (SIMD) style execution to achieve both high computational throughput and energy efficiency. As previous works have shown, there exists significant computational redundancy in SIMD execution, where different execution lanes operate on the same operand values. Such value locality is referred to as uniform vectors. In this paper, we first show that besides redundancy within a uniform vector, different vectors can also have the identical values. Then, we propose detailed architecture designs to exploit both types of redundancy. For redundancy within a uniform vector, we propose to either extend the vector register file with token bits or add a separate small scalar register file to eliminate redundant computations as well as redundant data storage. For redundancy across different uniform vectors, we adopt instruction reuse, proposed originally for CPU architectures, to detect and eliminate redundancy. The elimination of redundant computations and data storage leads to both significant energy savings and performance improvement. Furthermore, we propose to leverage such redundancy to protect arithmetic-logic units (ALUs) and register files against hardware errors. 
Our detailed evaluation shows that our proposed design has low hardware overhead and achieves performance gains, up to 23.9% and 12.0% on average, along with energy savings, up to 24.8% and 12.6% on average, as well as a 21.1% and 14.1% protection coverage for ALUs and register files, respectively.}, booktitle={Proceedings of the 27th International ACM Conference on International Conference on Supercomputing}, publisher={Association for Computing Machinery}, author={Xiang, P. and Yang, Y. and Mantor, M. and Rubin, N. and Hsu, L. and Zhou, H.}, year={2013}, month={Jun}, pages={433–442} } @article{gupta_xiang_yang_zhou_2013, title={Locality principle revisited: A probability-based quantitative approach}, volume={73}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2013.01.010}, abstractNote={This paper revisits the fundamental concept of the locality of references and proposes to quantify it as a conditional probability: in an address stream, given the condition that an address is accessed, how likely the same address (temporal locality) or an address within its neighborhood (spatial locality) will be accessed in the near future. Previous works use reuse distance histograms as a measure of temporal locality. For spatial locality, some ad hoc metrics have been proposed as a quantitative measure. In contrast, our conditional probability-based locality measure has a clear mathematical meaning and provides a theoretically sound and unified way to quantify both temporal and spatial locality. 
We showcase that our quantified locality measure can be used to evaluate compiler optimizations, to analyze the locality at different levels of memory hierarchy, to optimize the cache architecture to effectively leverage the locality, and to examine the effect of data prefetching mechanisms.}, number={7}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Gupta, Saurabh and Xiang, Ping and Yang, Yi and Zhou, Huiyang}, year={2013}, month={Jul}, pages={1011–1027} } @article{yang_zhou_2013, title={The Implementation of a High Performance GPGPU Compiler}, volume={41}, ISSN={["1573-7640"]}, DOI={10.1007/s10766-012-0228-3}, number={6}, journal={INTERNATIONAL JOURNAL OF PARALLEL PROGRAMMING}, author={Yang, Yi and Zhou, Huiyang}, year={2013}, month={Dec}, pages={768–781} } @article{yang_xiang_kong_mantor_zhou_2012, title={A Unified Optimizing Compiler Framework for Different GPGPU Architectures}, volume={9}, ISSN={["1544-3973"]}, DOI={10.1145/2207222.2207225}, abstractNote={This article presents a novel optimizing compiler for general purpose computation on graphics processing units (GPGPU). It addresses two major challenges of developing high performance GPGPU programs: effective utilization of GPU memory hierarchy and judicious management of parallelism. The input to our compiler is a naïve GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler generates two kernels, one optimized for global memories and the other for texture memories. The proposed compilation process is effective for both AMD/ATI and NVIDIA GPUs. 
The experiments show that our optimized code achieves very high performance, either superior or very close to highly fine-tuned libraries.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Yang, Yi and Xiang, Ping and Kong, Jingfei and Mantor, Mike and Zhou, Huiyang}, year={2012}, month={Jun} } @inproceedings{yang_xiang_mantor_zhou_2012, title={CPU-assisted GPGPU on fused CPU-GPU architectures}, booktitle={International symposium on high-performance computer}, author={Yang, Y. and Xiang, P. and Mantor, M. and Zhou, H. Y.}, year={2012}, pages={103–114} } @inproceedings{yang_xiang_mantor_zhou_2012, title={Fixing Performance Bugs: An Empirical Study of Open-Source GPGPU Programs}, ISBN={9781467325080 9780769547961}, url={http://dx.doi.org/10.1109/icpp.2012.30}, DOI={10.1109/icpp.2012.30}, abstractNote={Given the extraordinary computational power of modern graphics processing units (GPUs), general purpose computation on GPUs (GPGPU) has become an increasingly important platform for high performance computing. To better understand how well the GPU resource has been utilized by application developers and then to facilitate them to develop high performance GPGPU code, we conduct an empirical study on GPGPU programs from ten open-source projects. These projects span a wide range of disciplines and many are designed as high performance libraries. Among these projects, we found various performance 'bugs', i.e., code segments leading to inefficient use of GPU hardware. We characterize these performance bugs, and propose the bug fixes. 
Our experiments confirm both significant performance gains and energy savings from our fixes and reveal interesting insights on different GPUs.}, booktitle={2012 41st International Conference on Parallel Processing}, publisher={IEEE}, author={Yang, Yi and Xiang, Ping and Mantor, Mike and Zhou, Huiyang}, year={2012}, month={Sep} } @inproceedings{gupta_xiang_yang_huiyang_2012, title={Locality principle revisited: A probability-based quantitative approach}, DOI={10.1109/ipdps.2012.93}, abstractNote={This paper revisits the fundamental concept of the locality of references and proposes to quantify it as a conditional probability: in an address stream, given the condition that an address is accessed, how likely the same address (temporal locality) or an address within its neighborhood (spatial locality) will be accessed in the near future. Based on this definition, spatial locality is a function of two parameters, the neighborhood size and the scope of near future, and can be visualized with a 3D mesh. Temporal locality becomes a special case of spatial locality with the neighborhood size being zero byte. Previous works on locality analysis use stack/reuse distances to compute distance histograms as a measure of temporal locality. For spatial locality, some ad-hoc metrics have been proposed as a quantitative measure. In contrast, our conditional probability-based locality measure has a clear mathematical meaning, offers justification for distance histograms, and provides a theoretically sound and unified way to quantify both temporal and spatial locality. The proposed locality measure clearly exhibits the inherent application characteristics, from which we can easily derive information such as the sizes of the working data sets and how locality can be exploited. 
We showcase that our quantified locality visualized in 3D-meshes can be used to evaluate compiler optimizations, to analyze the locality at different levels of memory hierarchy, to optimize the cache architecture to effectively leverage the locality, and to examine the effect of data prefetching mechanisms. A GPU-based parallel algorithm is also presented to accelerate the locality computation for large address traces.}, booktitle={2012 ieee 26th international parallel and distributed processing symposium (ipdps)}, author={Gupta, S. and Xiang, P. and Yang, Y. and Zhou, H.}, year={2012}, pages={995–1009} } @inproceedings{yang_xiang_mantor_rubin_zhou_2012, title={Shared Memory Multiplexing: A Novel Way to Improve GPGPU Throughput}, ISBN={9781509066094 9781450311823}, booktitle={Proceedings of the 2012 21st International Conference on Parallel Architectures and Compilation Techniques (PACT)}, author={Yang, Y. and Xiang, P. and Mantor, M. and Rubin, N. and Zhou, H.}, year={2012}, month={Sep} } @article{dimitrov_zhou_2011_jilp, title={Combining Local and Global History for High Performance Data Prefetching}, volume={13}, journal={Journal of Instruction-Level Parallelism (JILP)}, author={Dimitrov, M. and Zhou, H.}, year={2011}, month={Feb}, pages={1–14} } @inproceedings{yang_zhou_2011, title={Developing a High Performance GPGPU Compiler using Cetus}, booktitle={Proceedings of the Cetus Users and Compiler Infrastructure Workshop, International Conference on Parallel Architectures and Compilation Techniques (PACT’11)}, author={Yang, Y. and Zhou, H.}, year={2011}, month={Oct} } @article{bhansali_panirwla_zhou_2011, title={Exploring Correlation for Indirect Branch Prediction}, journal={2nd JILP Workshop on Computer Architecture Competitions (JWAC-2): Championship Branch Prediction}, author={Bhansali, N. and Panirwla, C. 
and Zhou, H.}, year={2011}, month={Jun} } @inproceedings{dimitrov_zhou_2011, title={Time-Ordered Event Traces: A New Debugging Primitive for Concurrency Bugs}, ISBN={9781612843728}, url={http://dx.doi.org/10.1109/ipdps.2011.38}, DOI={10.1109/ipdps.2011.38}, abstractNote={Non-determinism makes concurrent bugs extremely difficult to reproduce and to debug. In this work, we propose a new debugging primitive to facilitate the debugging process by exposing this non-deterministic behavior to the programmer. The key idea is to generate a time-ordered trace of events such as function calls/returns and memory accesses across different threads. The architectural support for this primitive is lightweight, including a high-precision, frequency-invariant time-stamp counter and an event trace buffer in each processor core. The proposed primitive continuously records and timestamps the last N function calls/returns per core by default, and can also be configured to watch specific memory addresses or code regions through a flexible software interface. To examine the effectiveness of the proposed primitive, we studied a variety of concurrent bugs in large commercial software and our results show that exposing the time-ordered information, function calls/returns in particular, to the programmer is highly beneficial for diagnosing the root causes of these bugs.}, booktitle={2011 IEEE International Parallel & Distributed Processing Symposium}, publisher={IEEE}, author={Dimitrov, Martin and Zhou, Huiyang}, year={2011}, month={May} } @article{yang_xiang_kong_zhou_2010, title={A GPGPU Compiler for Memory Optimization and Parallelism Management}, volume={45}, ISSN={["0362-1340"]}, DOI={10.1145/1809028.1806606}, abstractNote={This paper presents a novel optimizing compiler for general purpose computation on graphics processing units (GPGPU). 
It addresses two major challenges of developing high performance GPGPU programs: effective utilization of GPU memory hierarchy and judicious management of parallelism.}, number={6}, journal={ACM SIGPLAN NOTICES}, author={Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang}, year={2010}, month={Jun}, pages={86–97} } @inproceedings{kong_dimitrov_yang_liyanage_cao_staples_mantor_zhou_2010, place={New York}, title={Accelerating MATLAB Image Processing Toolbox Functions on GPUs}, ISBN={9781605589350}, DOI={10.1145/1735688.1735703}, abstractNote={In this paper, we present our effort in developing an open-source GPU (graphics processing units) code library for the MATLAB Image Processing Toolbox (IPT). We ported a dozen of representative functions from IPT and based on their inherent characteristics, we grouped these functions into four categories: data independent, data sharing, algorithm dependent and data dependent. For each category, we present a detailed case study, which reveals interesting insights on how to efficiently optimize the code for GPUs and highlight performance-critical hardware features, some of which have not been well explored in existing literature. Our results show drastic speedups for the functions in the data-independent or data-sharing category by leveraging hardware support judiciously; and moderate speedups for those in the algorithm-dependent category by careful algorithm selection and parallelization. For the functions in the last category, fine-grain synchronization and data-dependency requirements are the main obstacles to an efficient implementation on GPUs.}, booktitle={Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units}, publisher={Association for Computing Machinery}, author={Kong, J. and Dimitrov, M. and Yang, Y. and Liyanage, J. and Cao, L. and Staples, J. and Mantor, M. 
and Zhou, H.}, year={2010}, month={Mar}, pages={75–85} } @article{yang_xiang_kong_zhou_2010, title={An Optimizing Compiler for GPGPU Programs with Input-Data Sharing}, volume={45}, ISSN={["1558-1160"]}, DOI={10.1145/1837853.1693505}, abstractNote={Developing high performance GPGPU programs is challenging for application developers since the performance is dependent upon how well the code leverages the hardware features of specific graphics processors. To solve this problem and relieve application developers of low-level hardware-specific optimizations, we introduce a novel compiler to optimize GPGPU programs. Our compiler takes a naive GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler then analyzes the code, identifies memory access patterns, and generates optimized code. The proposed compiler optimizations target at one category of scientific and media processing algorithms, which has the characteristics of input-data sharing when computing neighboring output pixels/elements. Many commonly used algorithms, such as matrix multiplication, convolution, etc., share such characteristics. For these algorithms, novel approaches are proposed to enforce memory coalescing and achieve effective data reuse. Data prefetching and hardware-specific tuning are also performed automatically with our compiler framework. 
The experimental results based on a set of applications show that our compiler achieves very high performance, either superior or very close to the highly fine-tuned library, NVIDIA CUBLAS 2.1.}, number={5}, journal={ACM SIGPLAN NOTICES}, author={Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang}, year={2010}, month={May}, pages={343–344} } @article{yang_xiang_kong_zhou_2010, title={An Optimizing Compiler for GPGPU Programs with Input-Data Sharing}, ISBN={["978-1-60558-708-0"]}, DOI={10.1145/1693453.1693505}, abstractNote={Developing high performance GPGPU programs is challenging for application developers since the performance is dependent upon how well the code leverages the hardware features of specific graphics processors. To solve this problem and relieve application developers of low-level hardware-specific optimizations, we introduce a novel compiler to optimize GPGPU programs. Our compiler takes a naive GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler then analyzes the code, identifies memory access patterns, and generates optimized code. The proposed compiler optimizations target at one category of scientific and media processing algorithms, which has the characteristics of input-data sharing when computing neighboring output pixels/elements. Many commonly used algorithms, such as matrix multiplication, convolution, etc., share such characteristics. For these algorithms, novel approaches are proposed to enforce memory coalescing and achieve effective data reuse. Data prefetching and hardware-specific tuning are also performed automatically with our compiler framework. 
The experimental results based on a set of applications show that our compiler achieves very high performance, either superior or very close to the highly fine-tuned library, NVIDIA CUBLAS 2.1.}, journal={PPOPP 2010: PROCEEDINGS OF THE 2010 ACM SIGPLAN SYMPOSIUM ON PRINCIPLES AND PRACTICE OF PARALLEL PROGRAMMING}, author={Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang}, year={2010}, pages={343–344} } @inproceedings{kong_zhou_2010, title={Improving privacy and lifetime of PCM-based main memory}, ISBN={9781424475001}, url={http://dx.doi.org/10.1109/dsn.2010.5544298}, DOI={10.1109/dsn.2010.5544298}, abstractNote={Phase change memory (PCM) is a promising technology for computer memory systems. However, the non-volatile nature of PCM poses serious threats to computer privacy. The low programming endurance of PCM devices also limits the lifetime of PCM-based main memory (PRAM). In this paper, we first adopt counter-mode encryption for privacy protection and show that encryption significantly reduces the effectiveness of some previously proposed wear-leveling techniques for PRAM. To mitigate such adverse impact, we propose simple, yet effective extensions to the encryption scheme. In addition, we propose to reuse the encryption counters as age counters and to dynamically adjust the strength of error correction code (ECC) to extend the lifetime of PRAM. 
Our experiments show that our mechanisms effectively achieve privacy protection and lifetime extension for PRAM with very low performance overhead.}, booktitle={2010 IEEE/IFIP International Conference on Dependable Systems & Networks (DSN)}, publisher={IEEE}, author={Kong, Jingfei and Zhou, Huiyang}, year={2010}, month={Jun} } @inproceedings{dimitrov_zhou_2009, title={Anomaly-based bug prediction, isolation, and validation}, ISBN={9781605584065}, url={http://dx.doi.org/10.1145/1508244.1508252}, DOI={10.1145/1508244.1508252}, abstractNote={Software defects, commonly known as bugs, present a serious challenge for system reliability and dependability. Once a program failure is observed, the debugging activities to locate the defects are typically nontrivial and time consuming. In this paper, we propose a novel automated approach to pin-point the root-causes of software failures. Our proposed approach consists of three steps. The first step is bug prediction, which leverages the existing work on anomaly-based bug detection as exceptional behavior during program execution has been shown to frequently point to the root cause of a software failure. The second step is bug isolation, which eliminates false-positive bug predictions by checking whether the dynamic forward slices of bug predictions lead to the observed program failure. The last step is bug validation, in which the isolated anomalies are validated by dynamically nullifying their effects and observing if the program still fails. The whole bug prediction, isolation and validation process is fully automated and can be implemented with efficient architectural support. Our experiments with 6 programs and 7 bugs, including a real bug in the gcc 2.95.2 compiler, show that our approach is highly effective at isolating only the relevant anomalies. 
Compared to state-of-art debugging techniques, our proposed approach pinpoints the defect locations more accurately and presents the user with a much smaller code set to analyze.}, booktitle={Proceeding of the 14th international conference on Architectural support for programming languages and operating systems - ASPLOS '09}, publisher={ACM Press}, author={Dimitrov, Martin and Zhou, Huiyang}, year={2009} } @inproceedings{kong_aciicmez_seifert_zhou_2009, title={Hardware-software integrated approaches to defend against software cache-based side channel attacks}, ISBN={9781424429325}, url={http://dx.doi.org/10.1109/hpca.2009.4798277}, DOI={10.1109/hpca.2009.4798277}, abstractNote={Software cache-based side channel attacks present serious threats to modern computer systems. Using caches as a side channel, these attacks are able to derive secret keys used in cryptographic operations through legitimate activities. Among existing countermeasures, software solutions are typically application specific and incur substantial performance overhead. Recent hardware proposals including the Partition-Locked cache (PLcache) and Random-Permutation cache (RPcache) [23], although very effective in reducing performance overhead while enhancing the security level, may still be vulnerable to advanced cache attacks. In this paper, we propose three hardware-software approaches to defend against software cache-based attacks - they present different tradeoffs between hardware complexity and performance overhead. First, we propose to use preloading to secure the PLcache. Second, we leverage informing loads, which is a lightweight architectural support originally proposed to improve memory performance, to protect the RPcache. Third, we propose novel software permutation to replace the random permutation hardware in the RPcache. This way, regular caches can be protected with hardware support for informing loads. 
In our experiments, we analyze various processor models for their vulnerability to cache attacks and demonstrate that even to the processor model that is most vulnerable to cache attacks, our proposed software-hardware integrated schemes provide strong security protection.}, booktitle={2009 IEEE 15th International Symposium on High Performance Computer Architecture}, publisher={IEEE}, author={Kong, J. and Aciicmez, O. and Seifert, J.-P. and Zhou, Huiyang}, year={2009}, month={Feb} } @inproceedings{dimitrov_mantor_zhou_2009, title={Understanding software approaches for GPGPU reliability}, ISBN={9781605585178}, url={http://dx.doi.org/10.1145/1513895.1513907}, DOI={10.1145/1513895.1513907}, abstractNote={Even though graphics processors (GPUs) are becoming increasingly popular for general purpose computing, current (and likely near future) generations of GPUs do not provide hardware support for detecting soft/hard errors in computation logic or memory storage cells since graphics applications are inherently fault tolerant. As a result, if an error occurs in GPUs during program execution, the results could be silently corrupted, which is not acceptable for general purpose computations. To improve the fidelity of general purpose computation on GPUs (GPGPU), we investigate software approaches to perform redundant execution. In particular, we propose and study three different, application-level techniques. The first technique simply executes the GPU kernel program twice, and thus achieves roughly half of the throughput of a non-redundant execution. The next two techniques interleave redundant execution with the original code in different ways to take advantage of the parallelism between the original code and its redundant copy. Furthermore, we evaluate the benefits of providing hardware support, including ECC/parity protection to on-chip and off-chip memories, for each of the software techniques. 
Interestingly, our findings, based on six commonly used applications, indicate that the benefits of complex software approaches are both application and architecture dependent. The simple approach, which executes the kernel twice, is often sufficient and may even outperform the complex ones. Moreover, we argue that the cost is not justified to protect memories with ECC/parity bits.}, booktitle={Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units - GPGPU-2}, publisher={ACM Press}, author={Dimitrov, Martin and Mantor, Mike and Zhou, Huiyang}, year={2009} } @inproceedings{gao_ma_dimitrov_zhou_2008, title={Address-branch correlation: A novel locality for long-latency hard-to-predict branches}, ISBN={9781424420704}, ISSN={1530-0897}, url={http://dx.doi.org/10.1109/hpca.2008.4658629}, DOI={10.1109/hpca.2008.4658629}, abstractNote={Hard-to-predict branches depending on long-latency cache-misses have been recognized as a major performance obstacle for modern microprocessors. With the widening speed gap between memory and microprocessors, such long-latency branch mispredictions also waste substantial power/energy in executing instructions on wrong paths, especially for large instruction window processors. This paper presents a novel program locality that can be exploited to handle long-latency hard-to-predict branches. The locality is a result of an interesting program execution behavior: for some applications, major data structures or key components of the data structures tend to remain stable for a long time. If a hard-to-predict branch depends on such stable data, the address of the data rather than the data value is sufficient to determine the branch outcome. This way, a misprediction can be resolved much more promptly when the data access results in a long-latency cache miss. 
We call such locality address-branch correlation and we show that certain memory-intensive benchmarks, especially those with heavy pointer chasing, exhibit this locality. We then propose a low-cost auxiliary branch predictor to exploit address-branch correlation. Our experimental results show that the proposed scheme reduces the execution time by 6.3% (up to 27%) and energy consumption by 5.2% (up to 24%) for a set of memory-intensive benchmarks with a 9 kB prediction table when used with a state-of-art 16 kB TAGE predictor.}, booktitle={2008 IEEE 14th International Symposium on High Performance Computer Architecture}, publisher={IEEE}, author={Gao, Hongliang and Ma, Yi and Dimitrov, Martin and Zhou, Huiyang}, year={2008}, month={Feb} } @inproceedings{kong_aciicmez_seifert_zhou_2008, title={Deconstructing new cache designs for thwarting software cache-based side channel attacks}, ISBN={9781605583006}, url={http://dx.doi.org/10.1145/1456508.1456514}, DOI={10.1145/1456508.1456514}, abstractNote={Software cache-based side channel attacks present a serious tthreat to computer systems. Previously proposed countermeasures were either too costly for practical use or only effective against particular attacks. Thus, a recent work identified cache interferences in general as the root cause and proposed two new cache designs, namely partition-locked cache (PLcache) and random permutation cache(RPcache), to defeat cache-based side channel attacks by eliminating/obfuscating cache interferences. In this paper, we analyze these new cache designs and identify significant vulnerabilities and shortcomings of those new cache designs. 
We also propose possible solutions and improvements over the original new cache designs to overcome the identified shortcomings.}, booktitle={Proceedings of the 2nd ACM workshop on Computer security architectures - CSAW '08}, publisher={ACM Press}, author={Kong, Jingfei and Aciicmez, Onur and Seifert, Jean-Pierre and Zhou, Huiyang}, year={2008} } @article{ma_gao_dimitrov_zhou_2007, title={Optimizing dual-core execution for power efficiency and transient-fault recovery}, volume={18}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2007.4288106}, abstractNote={Dual-core execution (DCE) is an execution paradigm proposed to utilize chip multiprocessors to improve the performance of single-threaded applications. Previous research has shown that DCE provides a complexity-effective approach to building a highly scalable instruction window and achieves significant latency-hiding capabilities. In this paper, we propose to optimize DCE for power efficiency and/or transient-fault recovery. In DCE, a program is first processed (speculatively) in the front processor and then reexecuted by the back processor. Such reexecution is the key to eliminating the centralized structures that are normally associated with very large instruction windows. In this paper, we exploit the computational redundancy in DCE to improve its reliability and its power efficiency. The main contributions include: 1) DCE-based redundancy checking for transient-fault tolerance and a complexity-effective approach to achieving full redundancy coverage and 2) novel techniques to improve the power/energy efficiency of DCE-based execution paradigms. Our experimental results demonstrate that, with the proposed simple techniques, the optimized DCE can effectively achieve transient-fault tolerance or significant performance enhancement in a power/energy-efficient way. 
Compared to the original DCE, the optimized DCE has similar speedups (34 percent on average) over single-core processors while reducing the energy overhead from 93 percent to 31 percent.}, number={8}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Ma, Yi and Gao, Hongliang and Dimitrov, Martin and Zhou, Huiyang}, year={2007}, month={Aug}, pages={1080–1093} } @article{gao_zhou_2007, title={PMPM: Prediction by combining multiple partial matches}, volume={9}, journal={Journal of Instruction-level Parallelism}, author={Gao, H. and Zhou, H.}, year={2007}, pages={1–18} } @inproceedings{dimitrov_zhou_2007, title={Unified Architectural Support for Soft-Error Protection or Software Bug Detection}, ISBN={0769529445 9780769529448}, ISSN={1089-795X}, url={http://dx.doi.org/10.1109/pact.2007.4336201}, DOI={10.1109/pact.2007.4336201}, abstractNote={In this paper we propose a unified architectural support that can be used flexibly for either soft-error protection or software bug detection. Our approach is based on dynamically detecting and enforcing instruction- level invariants. A hardware table is designed to keep track of run-time invariant information. During program execution, instructions access this table and compare their produced results against the stored invariants. Any violation of the predicted invariant suggests a potential abnormal behavior, which could be a result of a soft error or a latent software bug. In case of a soft error, monitoring invariant violations provides opportunistic soft-error protection to multiple structures in processor pipelines. Our experimental results show that invariant violations detect soft errors promptly and as a result, simple pipeline squashing is able to fix most of the detected soft errors. Meanwhile, the same approach can be easily adapted for software bug detection. 
The proposed architectural support eliminates the substantial performance overhead associated with software-based bug-detection approaches and enables continuous monitoring of production code.}, booktitle={16th International Conference on Parallel Architecture and Compilation Techniques (PACT 2007)}, publisher={IEEE}, author={Dimitrov, Martin and Zhou, Huiyang}, year={2007}, month={Sep} } @inproceedings{ma_zhou_2006, title={Efficient Transient-Fault Tolerance for Multithreaded Processors Using Dual-Thread Execution}, ISBN={9780780397064 9780780397071}, ISSN={1063-6404}, url={http://dx.doi.org/10.1109/iccd.2006.4380804}, DOI={10.1109/iccd.2006.4380804}, abstractNote={Reliability becomes a key issue in computer system design as microprocessors are increasingly susceptible to transient faults. Many previously proposed schemes exploit simultaneous multithreaded (SMT) architectures to achieve transient-fault tolerance by running a program concurrently on two threads, a main thread and a redundant checker thread. Such schemes however often incur high performance overheads due to resource contention and redundancy checking. In this paper, we propose dual-thread execution (DTE) for SMT processors to efficiently achieve transient-fault tolerance. DTE is derived from the recently proposed fault-tolerant dual-core execution (FTDCE) paradigm, in which two processor cores on a single chip perform redundant execution to improve both reliability and performance. In this paper, we apply the same principles as in FTDCE to SMT architectures and explore fetch policies to address the critical resource-sharing issue in SMT architectures. Our experimental results show that DTE achieves an average of 56.1% speedup over the previously proposed simultaneously and redundantly threaded processor with recovery (SRTR). 
More impressively, even compared to single-thread execution, DTE achieves full-coverage transient-fault tolerance along with an average of 15.5% performance improvement.}, booktitle={2006 International Conference on Computer Design}, publisher={IEEE}, author={Ma, Yi and Zhou, Huiyang}, year={2006}, month={Oct} } @inproceedings{kong_zou_zhou_2006, title={Improving software security via runtime instruction-level taint checking}, ISBN={1595935762}, url={http://dx.doi.org/10.1145/1181309.1181313}, DOI={10.1145/1181309.1181313}, abstractNote={Current taint checking architectures monitor tainted data usage mainly with control transfer instructions. An alarm is raised once the program counter becomes tainted. However, such architectures are not effective against non-control data attacks. In this paper we present a generic instruction-level runtime taint checking architecture for handling non-control data attacks. Under our architecture, instructions are classified as either Taintless-Instructions or Tainted-Instructions prior to program execution. An instruction is called a Tainted-Instruction if it is supposed to deal with tainted data. Otherwise it is called a Taintless-Instruction. A security alert is raised whenever a Taintless-Instruction encounters tainted data at runtime. The proposed architecture is implemented on the SimpleScalar simulator. The preliminary results from experiments on SPEC CPU 2000 benchmarks show that there are a significant amount of Taintless-Instructions. We also demonstrate effective usages of our architecture to detect buffer overflow and format string attacks.}, booktitle={Proceedings of the 1st workshop on Architectural and system support for improving software dependability - ASID '06}, publisher={ACM Press}, author={Kong, Jingfei and Zou, Cliff C. 
and Zhou, Huiyang}, year={2006} } @inproceedings{dimitrov_zhou_2006, title={Locality-based Information Redundancy for Processor Reliability}, booktitle={2nd Workshop on Architectural Reliability (WAR-2) held in conjunction with 39th International Symposium on Microarchitecture (MICRO-39)}, author={Dimitrov, M. and Zhou, H.}, year={2006}, month={Dec}, pages={29–36} } @inproceedings{gao_zhou_2006, title={PMPM: Prediction by Combining Multiple Partial Matches}, booktitle={2nd Championship Branch Prediction (CBP-2) held with the 39th International Symposium on Microarchitecture (MICRO-39)}, author={Gao, H. and Zhou, H.}, year={2006}, month={Dec}, pages={19–24} } @article{ma y._zhou_2006, title={Using index functions to reduce conflict aliasing in branch prediction tables}, volume={55}, number={8}, journal={IEEE Transactions on Computers}, author={Ma Y., Gao H. and Zhou, H.}, year={2006}, pages={1057–1061} } @article{zhou_2005, title={A case for fault tolerance and performance enhancement using chip multi-processors}, volume={4}, journal={IEEE Computer Architecture Letters}, author={Zhou, H.}, year={2005}, pages={1–4} } @article{gao_zhou_2005, title={Adaptive information processing: an effective way to improve perceptron branch predictors}, volume={7}, journal={Journal of Instruction-level Parallelism}, author={Gao, H. and Zhou, H.}, year={2005}, pages={1–10} } @inproceedings{zhou_conte_2005, title={Code size efficiency in global scheduling for ILP processors}, ISBN={0769515347}, url={http://dx.doi.org/10.1109/intera.2002.995845}, DOI={10.1109/intera.2002.995845}, abstractNote={In global scheduling for ILP processors, region-enlarging optimizations, especially tail duplication, are commonly used. The code size increase due to such optimizations, however, raises serious concerns about the affected I-cache and TLB performance. In this paper, we propose a quantitative measure of the code size efficiency at compile time for any code size related optimization. 
Then, based on the efficiency of tail duplication, we propose the solutions to two related problems: (1) how to achieve the best performance for a given code size increase, (2) how to get the optimal code size efficiency for any program. Our study shows that code size increase has a significant but varying impact on IPC, e.g., the first 2% code size increase results in 18.5% increase in static IPC, but less than 1% when the given code size further increases from 20% to 30%. We then use this feature to define the optimal code size efficiency and to derive a simple, yet robust threshold scheme finding it. The experimental results using SPECint95 benchmarks show that this threshold scheme finds the optimal efficiency accurately. While the optimal efficiency results show an average increase of 2% in code size, the improved I-cache performance is observed and a speedup of 17% over the natural treegion results is achieved.}, booktitle={Proceedings Sixth Annual Workshop on Interaction between Compilers and Computer Architectures}, publisher={IEEE Comput. Soc}, author={Zhou, Huiyang and Conte, T.M.}, year={2005}, month={Aug} }
@inproceedings{zhou_flanagan_conte_2005,
  title         = {Detecting global stride locality in value streams},
  ISBN          = {0769519458},
  url           = {http://dx.doi.org/10.1109/isca.2003.1207011},
  DOI           = {10.1109/isca.2003.1207011},
  abstractNote  = {Value prediction exploits localities in value streams. Previous research focused on exploiting two types of value localities, computational and context-based, in the local value history, which is the value sequence produced by the same instruction that is being predicted. Besides the local value history, value locality also exists in the global value history, which is the value sequence produced by all dynamic instructions according to their execution order. In this paper, a new type of value locality, the computational locality in the global value history is studied. A novel prediction scheme, called the gDiff predictor, is designed to exploit one special and most common case of this computational model, the stride-based computation, in the global value history. Such a scheme provides a general framework to exploit global stride locality in any value stream. Experiments show that there exists very strong stride type of locality in global value sequences. Ideally, the gDiff predictor can achieve 73% prediction accuracy for all value producing instructions without any hybrid scheme, much higher than local stride and local context prediction schemes. However, the capability of realistically exploiting locality in global value history is greatly challenged by the value delay issue, i.e., the correlated value may not be available when the prediction is being made. We study the value delay issue in an out-of-order (OOO) execution pipeline model and propose a new hybrid scheme to maximize the exploitation of the global stride locality. This new hybrid scheme shows 91% prediction accuracy and 64% coverage for all value producing instructions. We also show that the global stride locality detected by gDiff in load address streams provides strong capabilities in predicting load addresses (coverage 63% and accuracy 86%) and in predicting addresses of missing loads (33% coverage and 53% accuracy).},
  booktitle     = {30th Annual International Symposium on Computer Architecture, 2003. Proceedings.},
  publisher     = {IEEE Comput. Soc},
  author        = {Zhou, Huiyang and Flanagan, J. and Conte, T.M.},
  year          = {2005},
  month         = apr,
  internal-note = {NOTE(review): DOI and booktitle are ISCA 2003 but year={2005} -- looks like an export error; verify against the published record},
}
@inproceedings{zhou_2005,
  title        = {Dual-core execution: building a highly scalable single-thread instruction window},
  ISBN         = {076952429X},
  url          = {http://dx.doi.org/10.1109/pact.2005.18},
  DOI          = {10.1109/pact.2005.18},
  abstractNote = {Current integration trends embrace the prosperity of single-chip multi-core processors. Although multi-core processors deliver significantly improved system throughput, single-thread performance is not addressed. In this paper, we propose a new execution paradigm that utilizes multi-cores on a single chip collaboratively to achieve high performance for single-thread memory-intensive workloads while maintaining the flexibility to support multithreaded applications. The proposed execution paradigm, dual-core execution, consists of two superscalar cores (a front and back processor) coupled with a queue. The front processor fetches and preprocesses instruction streams and retires processed instructions into the queue for the back processor to consume. The front processor executes instructions as usual except for cache-missing loads, which produce an invalid value instead of blocking the pipeline. As a result, the front processor runs far ahead to warm up the data caches and fix branch mispredictions for the back processor. In-flight instructions are distributed in the front processor, the queue, and the back processor, forming a very large instruction window for single-thread out-of-order execution. The proposed architecture incurs only minor hardware changes and does not require any large centralized structures such as large register files, issue queues, load/store queues, or reorder buffers. Experimental results show remarkable latency hiding capabilities of the proposed architecture, even outperforming more complex single-thread processors with much larger instruction windows than the front or back processor.},
  booktitle    = {14th International Conference on Parallel Architectures and Compilation Techniques (PACT'05)},
  publisher    = {IEEE},
  author       = {Zhou, Huiyang},
  year         = {2005},
}
@article{huiyang_conte_2005,
  title        = {Enhancing memory-level parallelism via recovery-free value prediction},
  volume       = {54},
  DOI          = {10.1109/tc.2005.117},
  abstractNote = {The ever-increasing computational power of contemporary microprocessors reduces the execution time spent on arithmetic computations (i.e., the computations not involving slow memory operations such as cache misses) significantly. Therefore, for memory-intensive workloads, it becomes more important to overlap multiple cache misses than to overlap slow memory operations with other computations. In this paper, we propose a novel technique to parallelize sequential cache misses, thereby increasing memory-level parallelism (MLP). Our idea is based on value prediction, which was proposed originally as an instruction-level parallelism (ILP) optimization to break true data dependencies. In this paper, we advocate value prediction in its capability to enhance MLP instead of ILP. We propose using value prediction and value-speculative execution only for prefetching so that not only the complex prediction validation and misprediction recovery mechanisms are avoided, but better performance can also be achieved for memory-intensive workloads. The minor hardware modifications that are required also enable aggressive memory disambiguation for prefetching. The experimental results show that our technique enhances MLP effectively and achieves significant speedups, even with a simple stride value predictor.},
  journal      = {IEEE Transactions on Computers},
  author       = {Zhou, Huiyang and Conte, T. M.},
  year         = {2005},
  pages        = {897--912},
}
@inproceedings{gao_zhou_2004,
  title     = {Adaptive Information Processing: An Effective Way to Improve Perceptron Branch Predictors},
  booktitle = {1st Championship Branch Prediction (CBP-1) held with the 37th International Symposium on Microarchitecture (MICRO-37)},
  author    = {Gao, H. and Zhou, H.},
  year      = {2004},
  month     = dec,
}
@article{huiyang_toburen_rotenberg_conte_2003,
  title        = {Adaptive mode control: A static-power-efficient cache design},
  volume       = {2},
  DOI          = {10.1145/860176.860181},
  abstractNote = {Lower threshold voltages in deep submicron technologies cause more leakage current, increasing static power dissipation. This trend, combined with the trend of larger/more cache memories dominating die area, has prompted circuit designers to develop SRAM cells with low-leakage operating modes (e.g., sleep mode). Sleep mode reduces static power dissipation, but data stored in a sleeping cell is unreliable or lost. So, at the architecture level, there is interest in exploiting sleep mode to reduce static power dissipation while maintaining high performance. Current approaches dynamically control the operating mode of large groups of cache lines or even individual cache lines. However, the performance monitoring mechanism that controls the percentage of sleep-mode lines, and identifies particular lines for sleep mode, is somewhat arbitrary. There is no way to know what the performance could be with all cache lines active, so arbitrary miss rate targets are set (perhaps on a per-benchmark basis using profile information), and the control mechanism tracks these targets. We propose applying sleep mode only to the data store and not the tag store. By keeping the entire tag store active the hardware knows what the hypothetical miss rate would be if all data lines were active, and the actual miss rate can be made to precisely track it. Simulations show that an average of 73% of I-cache lines and 54% of D-cache lines are put in sleep mode with an average IPC impact of only 1.7%, for 64 KB caches.},
  number       = {3},
  journal      = {ACM Transactions on Embedded Computing Systems},
  author       = {Zhou, Huiyang and Toburen, M. C. and Rotenberg, E. and Conte, T. M.},
  year         = {2003},
  pages        = {347--372},
}
@techreport{zhou_2003,
  title       = {Code size aware compilation for real-time applications},
  institution = {Computer Science Department, University of Central Florida},
  author      = {Zhou, H.},
  year        = {2003},
  month       = jul,
}
@inproceedings{zhou_conte_2003,
  title     = {Enhancing Memory Level Parallelism via Recovery-Free Value Prediction},
  booktitle = {The 2003 International Conference on Supercomputing (ICS'03)},
  author    = {Zhou, H. and Conte, T.M.},
  year      = {2003},
  month     = jun,
  pages     = {326--335},
}
@techreport{zhou_conte_2003_techreport,
  address       = {Raleigh, NC},
  title         = {Performance modeling of memory latency hiding techniques},
  institution   = {Department of Electrical and Computer Engineering, North Carolina State University},
  author        = {Zhou, H. and Conte, T.M.},
  year          = {2003},
  month         = jan,
  internal-note = {key renamed from zhou_conte_2003: it duplicated the ICS'03 inproceedings key},
}
@incollection{zhou_jennings_conte_2003,
  title        = {Tree Traversal Scheduling: A Global Instruction Scheduling Technique for VLIW/EPIC Processors},
  volume       = {2624},
  ISBN         = {9783540040293 9783540357674},
  ISSN         = {0302-9743},
  url          = {http://dx.doi.org/10.1007/3-540-35767-x_15},
  DOI          = {10.1007/3-540-35767-x_15},
  abstractNote = {Global scheduling in a treegion framework has been proposed to exploit instruction level parallelism (ILP) at compile time. A treegion is a single-entry / multiple-exit global scheduling scope that consists of basic blocks with control-flow forming a tree. Because a treegion scope is nonlinear (includes multiple paths) it is distinguished from linear scopes such as traces or superblocks. Treegion scheduling has the capability of speeding up all possible paths within the scheduling scope. This paper presents a new global scheduling algorithm using treegions called Tree Traversal Scheduling (TTS). Efficient, incremental data-flow analysis in support of TTS is also presented. Performance results are compared to the scheduling of the linear regions that result from the decomposition of treegions. We refer to these resultant linear regions as linear treegions (LT) and consider them analogous to superblocks with the same amount of code expansion as the base treegion. Experimental results for TTS scheduling show a 35% speedup compared to basic block (BB) scheduling and a 4% speedup compared to LT scheduling.},
  booktitle    = {Languages and Compilers for Parallel Computing},
  publisher    = {Springer Berlin Heidelberg},
  author       = {Zhou, Huiyang and Jennings, Matthew D. and Conte, Thomas M.},
  year         = {2003},
  pages        = {223--238},
}
@techreport{zhou_conte_2002,
  address     = {Raleigh, NC},
  title       = {Using Performance Bounds to Guide Pre-scheduling Code Optimizations},
  institution = {Department of Electrical and Computer Engineering, North Carolina State University},
  author      = {Zhou, H. and Conte, T.M.},
  year        = {2002},
  month       = sep,
}
@techreport{jennings_zhou_conte_2001,
  address     = {Raleigh, NC},
  title       = {A Treegion-based Unified Approach to Speculation and Predication in Global Instruction Scheduling},
  institution = {Department of Electrical and Computer Engineering, North Carolina State University},
  author      = {Jennings, M.D. and Zhou, H. and Conte, T.M.},
  year        = {2001},
  month       = aug,
}
@techreport{zhou_fu_rotenberg_conte_2001,
  address     = {Raleigh, NC},
  title       = {A study of value speculative execution and mispeculation recovery in superscalar microprocessors},
  institution = {Department of Electrical and Computer Engineering, North Carolina State University},
  author      = {Zhou, H. and Fu, C. and Rotenberg, E. and Conte, T.},
  year        = {2001},
  month       = jan,
}
@inproceedings{huiyang_toburen_rotenberg_conte_2001,
  title        = {Adaptive mode control: A static-power-efficient cache design},
  ISBN         = {0769513638},
  DOI          = {10.1109/pact.2001.953288},
  abstractNote = {Lower threshold voltages in deep sub-micron technologies cause more leakage current, increasing static power dissipation. This trend, combined with the trend of larger/more cache memories dominating die area, has prompted circuit designers to develop SRAM cells with low-leakage operating modes (e.g., sleep mode). Sleep mode reduces static power dissipation but data stored in a sleeping cell is unreliable or lost. So, at the architecture level, there is interest in exploiting sleep mode to reduce static power dissipation while maintaining high performance. Current approaches dynamically control the operating mode of large groups of cache lines or even individual cache lines. However, the performance monitoring mechanism that controls the percentage of sleep-mode lines, and identifies particular lines for sleep mode, is somewhat arbitrary. There is no way to know what the performance could be with all cache lines active, so arbitrary miss rate targets are set (perhaps on a per-benchmark basis using profile information) and the control mechanism tracks these targets. We propose applying sleep mode only to the data store and not the tag store. By keeping the entire tag store active, the hardware knows what the hypothetical miss rate would be if all data lines were active and the actual miss rate can be made to precisely track it. Simulations show an average of 73% of I-cache lines and 54% of D-cache lines are put in sleep mode with an average IPC impact of only 1.7%, for 64KB caches.},
  booktitle    = {2001 International Conference on Parallel Architectures and Compilation Techniques: Proceedings: 8-12 September, 2001, Barcelona, Catalunya, Spain},
  publisher    = {Los Alamitos, CA: IEEE Computer Society},
  author       = {Zhou, Huiyang and Toburen, M. C. and Rotenberg, E. and Conte, T. M.},
  year         = {2001},
  pages        = {61--70},
}
@techreport{zhou_toburen_rotenberg_conte_2000,
  address     = {Raleigh, NC},
  title       = {Adaptive Mode Control: A Low-Leakage Power-Efficient Cache Design},
  institution = {Department of Electrical and Computer Engineering, North Carolina State University},
  author      = {Zhou, H. and Toburen, M. and Rotenberg, E. and Conte, T.},
  year        = {2000},
  month       = nov,
}
@article{kassim_huiyang_raganath_2000,
  title   = {Automatic IC orientation checks},
  volume  = {12},
  DOI     = {10.1007/s001380050129},
  number  = {3},
  journal = {Machine Vision and Applications},
  author  = {Kassim, A. A. and Zhou, Huiyang and Ranganath, S.},
  year    = {2000},
  pages   = {107--112},
}
@article{zhou_kassim_ranganath_1998,
  title   = {A fast algorithm for detecting die extrusion defects in IC packages},
  volume  = {11},
  ISSN    = {0932-8092},
  DOI     = {10.1007/s001380050088},
  number  = {1},
  journal = {MACHINE VISION AND APPLICATIONS},
  author  = {Zhou, H and Kassim, AA and Ranganath, S},
  year    = {1998},
  pages   = {37--41},
}
@article{zhou_qu_li_1996,
  title        = {Test sequencing and diagnosis in electronic system with decision table},
  volume       = {36},
  ISSN         = {0026-2714},
  DOI          = {10.1016/0026-2714(95)00142-5},
  abstractNote = {In this paper, the decision table is used as a tool for representation of the test knowledge. With that table, the algorithm for building optimal decision trees, which embody the solution for the test sequencing and diagnosis problem, is analyzed. Some improvements in the algorithm are also proposed for better efficiency. Furthermore, in order to describe more complicated situation, the conditional probabilities are included in the decision table, called as conditional decision table. Different approaches for generating optimal conditional decision trees, based on the information theory and entropy, are proposed. Finally, the algorithm for building dynamic test and diagnostic procedures is also presented in this paper.},
  number       = {9},
  journal      = {MICROELECTRONICS AND RELIABILITY},
  author       = {Zhou, HY and Qu, LS and Li, AH},
  year         = {1996},
  month        = sep,
  pages        = {1167--1175},
}