@article{xu_pangia_ye_solihin_shen_2024, title={Data Enclave: A Data-Centric Trusted Execution Environment}, ISBN={["979-8-3503-9314-9"]}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA57654.2024.00026}, journal={2024 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA 2024}, author={Xu, Yuanchao and Pangia, James and Ye, Chencheng and Solihin, Yan and Shen, Xipeng}, year={2024}, pages={218–232} } @inproceedings{niu_sanim_shu_guan_shen_yin_agrawal_ren_2024, title={SmartMem: Layout Transformation Elimination and Adaptation for Efficient DNN Execution on Mobile}, url={https://doi.org/10.1145/3620666.3651384}, DOI={10.1145/3620666.3651384}, abstractNote={This work is motivated by recent developments in Deep Neural Networks, particularly the Transformer architectures underlying applications such as ChatGPT, and the need for performing inference on mobile devices. Focusing on emerging transformers (specifically the ones with computationally efficient Swin-like architectures) and large models (e.g., Stable Diffusion and LLMs) based on transformers, we observe that layout transformations between the computational operators cause a significant slowdown in these applications. This paper presents SmartMem, a comprehensive framework for eliminating most layout transformations, with the idea that multiple operators can use the same tensor layout through careful choice of layout and implementation of operations. Our approach is based on classifying the operators into four groups, and considering combinations of producer-consumer edges between the operators. We develop a set of methods for searching such layouts. Another component of our work is developing efficient memory layouts for 2.5 dimensional memory commonly seen in mobile devices. Our experimental results show that SmartMem outperforms 5 state-of-the-art DNN execution frameworks on mobile devices across 18 varied neural networks, including CNNs, Transformers with both local and global attention, as well as LLMs. In particular, compared to DNNFusion, SmartMem achieves an average speedup of 2.8×, and outperforms TVM and MNN with speedups of 6.9× and 7.9×, respectively, on average.}, author={Niu, Wei and Sanim, Md Musfiqur Rahman and Shu, Zhihao and Guan, Jiexiong and Shen, Xipeng and Yin, Miao and Agrawal, Gagan and Ren, Bin}, year={2024}, month={Apr} } @inproceedings{huang_zhai_zheng_wang_jin_zhang_zhang_zheng_yi_shen_2024, title={WiseGraph: Optimizing GNN with Joint Workload Partition of Graph and Operations}, url={https://doi.org/10.1145/3627703.3650063}, DOI={10.1145/3627703.3650063}, author={Huang, Kezhao and Zhai, Jidong and Zheng, Liyan and Wang, Haojie and Jin, Yuyang and Zhang, Qihao and Zhang, Runqing and Zheng, Zhen and Yi, Youngmin and Shen, Xipeng}, year={2024}, month={Apr} } @article{chen_sung_shen_tallent_barker_li_2023, title={Accelerating matrix-centric graph processing on GPUs through bit-level optimizations}, volume={177}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2023.02.013}, abstractNote={Even though it is well known that binary values are common in graph applications (e.g., adjacency matrix), how to leverage the phenomenon for efficiency has not yet been adequately explored. This paper presents a systematic study on how to unlock the potential of the bit-level optimizations of graph computations that involve binary values. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by the intrinsics of modern GPUs. It additionally introduces Deep Reinforcement Learning (DRL) as an efficient way to best configure the bit-level optimizations on the fly. The DQN-based adaptive tile size selector with dedicated model training can reach 68% prediction accuracy. Evaluations on the NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40 and 6555 for essential GraphBLAS kernels SpMV and SpGEMM, respectively, accelerating GraphBLAS-based BFS by up to 433, SSSP, PR, and CC 35, and TC 52.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang}, year={2023}, month={Jul}, pages={53–67} } @article{zhang_mariano_shen_dillig_2023, title={Automated Translation of Functional Big Data similar to eries to SQL}, volume={7}, ISSN={["2475-1421"]}, url={https://doi.org/10.1145/3586047}, DOI={10.1145/3586047}, abstractNote={ Big data analytics frameworks like Apache Spark and Flink enable users to implement queries over large, distributed databases using functional APIs. In recent years, these APIs have grown in popularity because their functional interfaces abstract away much of the minutiae of distributed programming required by traditional query languages like SQL. However, the convenience of these APIs comes at a cost because functional queries are often less efficient than their SQL counterparts. Motivated by this observation, we present a new technique for automatically transpiling functional queries to SQL. While our approach is based on the standard paradigm of counterexample-guided inductive synthesis, it uses a novel column-wise decomposition technique to split the synthesis task into smaller subquery synthesis problems. We have implemented this approach as a new tool called RDD2SQL for translating Spark RDD queries to SQL and empirically evaluate the effectiveness of RDD2SQL on a set of real-world RDD queries. Our results show that (1) most RDD queries can be translated to SQL, (2) our tool is very effective at automating this translation, and (3) performing this translation offers significant performance benefits. }, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, author={Zhang, Guoqiang and Mariano, Benjamin and Shen, Xipeng and Dillig, Isil}, year={2023}, month={Apr} } @article{chen_sung_shen_choudhury_li_2023, title={BitGNN: Unleashing the Performance Potential of Binary Graph Neural Networks on GPUs}, url={https://doi.org/10.1145/3577193.3593725}, DOI={10.1145/3577193.3593725}, abstractNote={Recent studies have shown that Binary Graph Neural Networks (GNNs) are promising for saving computations of GNNs through binarized tensors. Prior work, however, mainly focused on algorithm designs or training techniques, leaving it open to how to materialize the performance potential on accelerator hardware fully. This work redesigns the binary GNN inference backend from the efficiency perspective. It fills the gap by proposing a series of abstractions and techniques to map binary GNNs and their computations best to fit the nature of bit manipulations on GPUs. Results on real-world graphs with GCNs, GraphSAGE, and GraphSAINT show that the proposed techniques outperform state-of-the-art binary GNN implementations by 8-22X with the same accuracy maintained. BitGNN code is publicly available.1.}, journal={PROCEEDINGS OF THE 37TH INTERNATIONAL CONFERENCE ON SUPERCOMPUTING, ACM ICS 2023}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Choudhury, Sutanay and Li, Ang}, year={2023}, pages={264–276} } @article{chen_zhang_guan_zhai_shen_zhang_shu_du_2023, title={CompressGraph: Efficient Parallel Graph Analytics with Rule-Based Compression}, url={https://doi.org/10.1145/3588684}, DOI={10.1145/3588684}, abstractNote={Modern graphs exert colossal time and space pressure on graph analytics applications. In 2022, Facebook social graph reaches 2.91 billion users with trillions of edges. Many compression algorithms have been developed to support direct processing on compressed graphs to address this challenge. However, previous graph compression algorithms do not focus on leveraging redundancy in repeated neighbor sequences, so they do not save the amount of computation for graph analytics. We develop CompressGraph, an efficient rule-based graph analytics engine that leverages data redundancy in graphs to achieve both performance boost and space reduction for common graph applications. CompressGraph has three advantages over previous works. First, the rule-based abstraction of CompressGraph supports the reuse of intermediate results during graph traversal, thus saving time. Second, CompressGraph has intense expressiveness to support a wide range of graph applications. Third, CompressGraph scales well under high parallelism because the context-free rules have few dependencies. Experiments show that CompressGraph provides significant performance and space benefits on both CPUs and GPUs. On evaluating six typical graph applications, CompressGraph can achieve 1.97× speedup on the CPU, while 3.95× speedup on the GPU, compared to the state-of-the-art CPU and GPU methods, respectively. Moreover, CompressGraph can save an average of 71.27% memory savings on CPU and 70.36 on GPU.}, journal={Proceedings of the ACM on Management of Data}, author={Chen, Zheng and Zhang, Feng and Guan, JiaWei and Zhai, Jidong and Shen, Xipeng and Zhang, Huanchen and Shu, Wentong and Du, Xiaoyong}, year={2023}, month={May} } @article{zhang_wu_guan_zheng_guo_zhang_du_shen_2023, title={Expanding the Edge: Enabling Efficient Winograd CNN Inference With Deep Reuse on Edge Device}, volume={35}, ISSN={["1558-2191"]}, url={https://doi.org/10.1109/TKDE.2023.3269017}, DOI={10.1109/TKDE.2023.3269017}, abstractNote={Deep learning on edge devices is becoming increasingly important, especially with the explosion of IoT devices. For example, the total number of devices connected to IoT reaches 29 billion in 2022. Convolutional neural networks (CNNs), as common deep learning representatives, are among the most popular neural networks in knowledge and data engineering. However, CNN employs a high degree of computing. In comparison to the training phase, the inference process is more frequently done on low-power computing equipments, such as edge devices. The limited computing resource and high computation pressure limit the effective use of CNN algorithms at the edge. Fortunately, a minimal filtering algorithm called Winograd can reduce convolution calculations by minimizing multiplication operations. We find that Winograd convolution can be accelerated further by deep reuse technique, which reuses the similar data and computation processes. In this paper, we propose a new inference method, called DREW, which combines deep reuse with Winograd for further accelerating CNNs. DREW handles three difficulties. First, it can detect the similarities from the complex minimal filtering patterns by clustering. Second, it reduces the online clustering cost in a reasonable range. Third, it provides an adjustable method in clustering granularity balancing the performance and accuracy. We perform evaluation on Raspberry PI and NVIDIA Jetson AGX Xavier edge devices, and experiments show that on five popular networks, 1) DREW further accelerates the Winograd convolution by an average of 8.27× speedup. Even for the highly parallel Winograd implementation, DREW still can provide 2.21× speedup. 2) When DREW is applied to end-to-end Winograd CNN inferences, DREW achieves 5.94× the average performance speedup with no ($< $<0.4%) accuracy loss. 3) Energy consumption is an important factor for inference in practice. DREW reduces the number of convolution operations to 10% of the original operations, thus achieving up to 60% energy-efficiency benefits than the original Winograd inference.}, number={10}, journal={IEEE TRANSACTIONS ON KNOWLEDGE AND DATA ENGINEERING}, author={Zhang, Feng and Wu, Ruofan and Guan, Jiawei and Zheng, Zhen and Guo, Xiaoguang and Zhang, Xiao and Du, Xiaoyong and Shen, Xipeng}, year={2023}, month={Oct}, pages={10181–10196} } @article{ye_xu_shen_sha_liao_jin_solihin_2023, title={Reconciling Selective Logging and Hardware Persistent Memory Transaction}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA56546.2023.10071088}, abstractNote={Log creation, maintenance, and its persist ordering are known to be performance bottlenecks for durable transactions on persistent memory. Existing hardware persistent memory transactions overlook an important opportunity for improving performance: some persistent data is algorithmically redundant such that it can be recovered from other data, removing the need for logging such data. The paper presents an ISA extension that enables selective logging for hardware persistent memory transactions for the first time. The ISA extension features two novel components: fine-grain logging and lazy persistency. Fine-grain logging allows hardware to log updates on data in the granularity of words without lengthening the critical path of data accesses. Lazy persistency allows updated data to remain in the cache after the transaction commits. Together, the new hardware persistent memory transaction outperforms the state-of-the-art hardware counterpart by 1.8× on average.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Sha, Yan and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2023}, pages={664–676} } @article{ye_xu_shen_sha_liao_jin_solihin_2023, title={SpecPMT: Speculative Logging for Resolving Crash Consistency Overhead of Persistent Memory}, DOI={10.1145/3575693.3575696}, abstractNote={Crash consistency overhead is a long-standing barrier to the adoption of byte-addressable persistent memory in practice. Despite continuous progress, persistent transactions for crash consistency still incur a 5.6X slowdown, making persistent memory prohibitively costly in practical settings. This paper introduces speculative logging, a new method that forgoes most memory fences and reduces data persistence overhead by logging data values early. This technique enables a novel persistent transaction model, speculatively persistent memory transactions (SpecPMT). Our evaluation shows that SpecPMT reduces the execution time overheads of persistent transactions substantially to just 10%.}, journal={PROCEEDINGS OF THE 28TH ACM INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS, VOL 2, ASPLOS 2023}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Sha, Yan and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2023}, pages={762–777} } @article{chen_niu_ren_wang_shen_2023, title={Survey: Exploiting Data Redundancy for Optimization of Deep Learning}, volume={55}, ISSN={["1557-7341"]}, DOI={10.1145/3564663}, abstractNote={ Data redundancy is ubiquitous in the inputs and intermediate results of Deep Neural Networks (DNN) . It offers many significant opportunities for improving DNN performance and efficiency and has been explored in a large body of work. These studies have scattered in many venues across several years. The targets they focus on range from images to videos and texts, and the techniques they use to detect and exploit data redundancy also vary in many aspects. There is not yet a systematic examination and summary of the many efforts, making it difficult for researchers to get a comprehensive view of the prior work, the state of the art, differences and shared principles, and the areas and directions yet to explore. This article tries to fill the void. It surveys hundreds of recent papers on the topic, introduces a novel taxonomy to put the various techniques into a single categorization framework, offers a comprehensive description of the main methods used for exploiting data redundancy in improving multiple kinds of DNNs on data, and points out a set of research opportunities for future exploration. }, number={10}, journal={ACM COMPUTING SURVEYS}, author={Chen, Jou-An and Niu, Wei and Ren, Bin and Wang, Yanzhi and Shen, Xipeng}, year={2023}, month={Oct} } @article{chen_sung_shen_tallent_barker_li_2022, title={Bit-GraphBLAS: Bit-Level Optimizations of Matrix-Centric Graph Processing on GPU}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS53621.2022.00056}, abstractNote={In a general graph data structure like an adjacency matrix, when edges are homogeneous, the connectivity of two nodes can be sufficiently represented using a single bit. This insight has, however, not yet been adequately exploited by the existing matrix-centric graph processing frameworks. This work fills the void by systematically exploring the bit-level representation of graphs and the corresponding optimizations to the graph operations. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by leveraging the intrinsics of modern GPUs. Evaluations on NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40× and 6555× for essential GraphBLAS kernels SpMV and SpGEMM, respectively, making GraphBLAS-based BFS accelerate up to 433×, SSSP, PR, and CC up to 35×, and TC up to 52×.}, journal={2022 IEEE 36TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2022)}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang}, year={2022}, pages={515–525} } @article{sung_xu_guan_niu_ren_wang_liu_shen_2022, title={Brief Industry Paper: Enabling Level-4 Autonomous Driving on a Single $1k Off-the-Shelf Card}, ISSN={["1545-3421"]}, DOI={10.1109/RTAS54340.2022.00032}, abstractNote={In the past few years we have developed hardware computing systems for commercial autonomous vehicles, but inevitably the high development cost and long turn-around time have been major roadblocks for commercial deployment. Hence we also explored the potential of software optimization. This paper, for the first-time, shows that it is feasible to enable full leve1-4 autonomous driving workloads on a single off-the-shelf card (Jetson AGX Xavier) for less than ${\$}1\mathrm{k}$, an order of magnitude less than the state-of-the-art systems, while meeting all the requirements of latency. The success comes from the resolution of some important issues shared by existing practices through a series of measures and innovations.}, journal={2022 IEEE 28TH REAL-TIME AND EMBEDDED TECHNOLOGY AND APPLICATIONS SYMPOSIUM (RTAS)}, author={Sung, Hsin-Hsuan and Xu, Yuanchao and Guan, Jiexiong and Niu, Wei and Ren, Bin and Wang, Yanzhi and Liu, Shaoshan and Shen, Xipeng}, year={2022}, pages={297–300} } @article{wu_zhang_guan_zheng_du_shen_2022, title={DREW: Efficient Winograd CNN Inference with Deep Reuse}, DOI={10.1145/3485447.3511985}, abstractNote={Deep learning has been used in various domains, including Web services. Convolutional neural networks (CNNs), which are deep learning representatives, are among the most popular neural networks in Web systems. However, CNN employs a high degree of computing. In comparison to the training phase, the inference process is more frequently done on low-power computing equipments. The limited computing resource and high computation pressure limit the effective use of CNN algorithms in industry. Fortunately, a minimal filtering algorithm called Winograd can reduce convolution calculations by minimizing multiplication operations. We find that Winograd convolution can be sped up further by deep reuse technique, which reuses the similar data and computation processes. In this paper, we propose a new inference method, called DREW, which combines deep reuse with Winograd for further accelerating CNNs. DREW handles three difficulties. First, it can detect the similarities from the complex minimal filtering patterns by clustering. Second, it reduces the online clustering cost in a reasonable range. Third, it provides an adjustable method in clustering granularity balancing the performance and accuracy. Experiments show that 1) DREW further accelerates the Winograd convolution by an average of 2.06 × speedup; 2) when DREW is applied to end-to-end Winograd CNN inference, it achieves 1.71 × the average performance speedup with no (<0.4%) accuracy loss; 3) DREW reduces the number of convolution operations to 11% of the original operations on average.}, journal={PROCEEDINGS OF THE ACM WEB CONFERENCE 2022 (WWW'22)}, author={Wu, Ruofan and Zhang, Feng and Guan, Jiawei and Zheng, Zhen and Du, Xiaoyong and Shen, Xipeng}, year={2022}, pages={1807–1816} } @article{cicek_shen_ozturk_2022, title={Energy Efficient Boosting of GEMM Accelerators for DNN via Reuse}, volume={27}, ISSN={["1557-7309"]}, DOI={10.1145/3503469}, abstractNote={ Reuse-centric convolutional neural networks (CNN) acceleration speeds up CNN inference by reusing computations for similar neuron vectors in CNN’s input layer or activation maps. This new paradigm of optimizations is, however, largely limited by the overheads in neuron vector similarity detection, an important step in reuse-centric CNN. This article presents an in-depth exploration of architectural support for reuse-centric CNN. It addresses some major limitations of the state-of-the-art design and proposes a novel hardware accelerator that improves neuron vector similarity detection and reduces the energy consumption of reuse-centric CNN inference. The accelerator is implemented to support a wide variety of neural network settings with a banked memory subsystem. Design exploration is performed through RTL simulation and synthesis on an FPGA platform. When integrated into Eyeriss, the accelerator can potentially provide improvements up to 7.75 \( \times \) in performance. Furthermore, it can reduce the energy used for similarity detection up to 95.46%, and it can accelerate the convolutional layer up to 3.63 \( \times \) compared to the software-based implementation running on the CPU. }, number={5}, journal={ACM TRANSACTIONS ON DESIGN AUTOMATION OF ELECTRONIC SYSTEMS}, author={Cicek, Nihat Mert and Shen, Xipeng and Ozturk, Ozcan}, year={2022}, month={Sep} } @article{pan_zhang_zhou_zhai_shen_mutlu_du_2022, title={Exploring Data Analytics Without Decompression on Embedded GPU Systems}, volume={33}, ISSN={["1558-2183"]}, url={https://doi.org/10.1109/TPDS.2021.3119402}, DOI={10.1109/TPDS.2021.3119402}, abstractNote={With the development of computer architecture, even for embedded systems, GPU devices can be integrated, providing outstanding performance and energy efficiency to meet the requirements of different industries, applications, and deployment environments. Data analytics is an important application scenario for embedded systems. Unfortunately, due to the limitation of the capacity of the embedded device, the scale of problems handled by the embedded system is limited. In this paper, we propose a novel data analytics method, called G-TADOC, for efficient text analytics directly on compression on embedded GPU systems. A large amount of data can be compressed and stored in embedded systems, and can be processed directly in the compressed state, which greatly enhances the processing capabilities of the systems. Particularly, G-TADOC has three innovations. First, a novel fine-grained thread-level workload scheduling strategy for GPU threads has been developed, which partitions heavily-dependent loads adaptively in a fine-grained manner. Second, a GPU thread-safe memory pool has been developed to handle inconsistency with low synchronization overheads. Third, a sequence-support strategy is provided to maintain high GPU parallelism while ensuring sequence information for lossless compression. Moreover, G-TADOC involves special optimizations for embedded GPUs, such as utilizing the CPU-GPU shared unified memory. Experiments show that G-TADOC provides 13.2× average speedup compared to the state-of-the-art TADOC. G-TADOC also improves performance-per-cost by 2.6× and energy efficiency by 32.5× over TADOC.}, number={7}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Pan, Zaifeng and Zhang, Feng and Zhou, Yanliang and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2022}, month={Jul}, pages={1553–1568} } @article{xu_ye_solihin_shen_2022, title={FFCCD: Fence-Free Crash-Consistent Concurrent Defragmentation for Persistent Memory}, ISSN={["1063-6897"]}, DOI={10.1145/3470496.3527406}, abstractNote={Persistent Memory (PM) is increasingly supplementing or substituting DRAM as main memory. Prior work have focused on reusability and memory leaks of persistent memory but have not addressed a problem amplified by persistence, persistent memory fragmentation, which refers to the continuous worsening of fragmentation of persistent memory throughout its usage. This paper reveals the challenges and proposes the first systematic crash-consistent solution, Fence-Free Crash-consistent Concurrent Defragmentation (FFCCD). FFCCD resues persistent pointer format, root nodes and typed allocation provided by persistent memory programming model to enable concurrent defragmentation on PM. FFCCD introduces architecture support for concurrent defragmentation that enables a fence-free design and fast read barrier, reducing two major overheads of defragmenting persistent memory. The techniques is effective (28--73% fragmentation reduction) and fast (4.1% execution time overhead).}, journal={PROCEEDINGS OF THE 2022 THE 49TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA '22)}, author={Xu, Yuanchao and Ye, Chencheng and Solihin, Yan and Shen, Xipeng}, year={2022}, pages={274–288} } @article{niu_guan_shen_wang_agrawal_ren_2022, title={GCD(2) : A Globally Optimizing Compiler for Mapping DNNs to Mobile DSPs}, ISSN={["1072-4451"]}, DOI={10.1109/MICRO56248.2022.00044}, abstractNote={More specialized chips are exploiting available high transistor density to expose parallelism at a large scale with more intricate instruction sets. This paper reports on a compilation system GCD2, developed to support complex Deep Neural Network (DNN) workloads on mobile DSP chips. We observe several challenges in fully exploiting this architecture, related to SIMD width, more complex SIMD/vector instructions, and VLIW pipeline with the notion of soft dependencies. GCD2 comprises the following contributions: 1) development of matrix layout formats that support the use of different novel SIMD instructions, 2) formulation and solution of a global optimization problem related to choosing the best instruction (and associated layout) for implementation of each operator in a complete DNN, and 3) SDA, an algorithm for packing instructions with consideration for soft dependencies. These solutions are incorporated in a complete compilation system that is extensively evaluated against other systems using 10 large DNN models. Evaluation results show that GCD2 outperforms two product-level state-of-the-art end-to-end DNN execution frameworks (TFLite and Qualcomm SNPE) that support mobile DSPs by up to $ 6.0 \times$ speedup, and outperforms three established compilers (Halide, TVM, and RAKE) by up to $4.5 \times, 3.4 \times$ and $4.0 \times$ speedup, respectively. GCD2 is also unique in supporting, real-time execution of certain DNNs, while its implementation enables two major DNNs to execute on a mobile DSP for the first time.}, journal={2022 55TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO)}, author={Niu, Wei and Guan, Jiexiong and Shen, Xipeng and Wang, Yanzhi and Agrawal, Gagan and Ren, Bin}, year={2022}, pages={512–529} } @article{cicek_ning_ozturk_shen_2022, title={General Reuse-Centric CNN Accelerator}, volume={71}, ISSN={["1557-9956"]}, url={https://doi.org/10.1109/TC.2021.3064608}, DOI={10.1109/TC.2021.3064608}, abstractNote={This article introduces the first general reuse-centric accelerator for CNN inferences. Unlike prior work that exploits similarities only across consecutive video frames, general reuse-centric accelerator is able to discover similarities among arbitrary patches within an image or across independent images, and translate them into computation time and energy savings. Experiments show that the accelerator complements both prior software-based CNN and various CNN hardware accelerators, producing up to 14.96X speedups for similarity discovery, up to 2.70X speedups for overall inference.}, number={4}, journal={IEEE TRANSACTIONS ON COMPUTERS}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Cicek, Nihat Mert and Ning, Lin and Ozturk, Ozcan and Shen, Xipeng}, year={2022}, month={Apr}, pages={880–891} } @article{young_nan_shen_2022, title={IDE Augmented with Human-Learning Inspired Natural Language Programming}, ISSN={["2574-1926"]}, DOI={10.1145/3510454.3516832}, abstractNote={Natural Language (NL) programming, the concept of synthesizing code from natural language inputs, has garnered growing interest among the software community in recent years. Unfortunately, current solutions in the space all suffer from the same problem, they require many labeled training examples due to their data-driven nature. To address this issue, this paper proposes an NLU-driven approach that forgoes the need for large numbers of labeled training examples. Inspired by how humans learn programming, this solution centers around Natural Language Understanding and draws on a novel graph-based mapping algorithm. The resulting NL programming framework, HISyn, uses no training examples, but gives synthesis accuracies comparable to data-driven methods trained on hundreds of samples. HISyn meanwhile demonstrates advantages in terms of interpretability, error diagnosis support, and cross-domain extensibility. To encourage adoption of HISyn among developers, the tool is made available as an extension for the Visual Studio Code IDE, thereby allowing users to easily submit inputs to HISyn and insert the generated code expressions into their active programs. A demo of the HISyn Extension can be found at https://youtu.be/KKOqJS24FNo.}, journal={2022 ACM/IEEE 44TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING: COMPANION PROCEEDINGS (ICSE-COMPANION 2022)}, author={Young, Mitchell and Nan, Zifan and Shen, Xipeng}, year={2022}, pages={110–114} } @article{nan_dave_shen_liao_vanderbruggen_lin_emani_2022, title={Interactive NLU-Powered Ontology-Based Workflow Synthesis for FAIR Support of HPC}, DOI={10.1109/HUST56722.2022.00009}, abstractNote={Workflow synthesis is important for automatically creating the data processing workflow in a FAIR data management system for HPC. Previous methods are table-based, rigid and not scalable. This paper addresses these limitations by developing a new approach to workflow synthesis, interactive NLU-powered ontology-based workflow synthesis (INPOWS). IN-POWS allows the use of Natural Language for queries, maximizes the robustness in handling concepts and language ambiguities through an interactive ontology-based design, and achieves superior extensibility by adopting a synthesis algorithm powered by Natural Language Understanding. In our experiments, INPOWS shows the efficacy in enabling flexible, robust, and extensible workflow synthesis.}, journal={2022 IEEE/ACM INTERNATIONAL WORKSHOP ON HPC USER SUPPORT TOOLS (HUST)}, author={Nan, Zifan and Dave, Mithil and Shen, Xipeng and Liao, Chunhua and Vanderbruggen, Tristan and Lin, Pei-Hung and Emani, Murali}, year={2022}, pages={29–40} } @article{zhang_zhai_shen_mutlu_du_2022, title={POCLib: A High-Performance Framework for Enabling Near Orthogonal Processing on Compression}, volume={33}, ISSN={["1558-2183"]}, url={https://doi.org/10.1109/TPDS.2021.3093234}, DOI={10.1109/TPDS.2021.3093234}, abstractNote={Parallel technology boosts data processing in recent years, and parallel direct data processing on hierarchically compressed documents exhibits great promise. The high-performance direct data processing technique brings large savings in both time and space by removing the need for decompressing data. However, its benefits have been limited to data traversal operations; for random accesses, direct data processing is several times slower than the state-of-the-art baselines. This article proposes a novel concept, orthogonal processing on compression (orthogonal POC), which means that text analytics can be efficiently supported directly on compressed data, regardless of the type of the data processing – that is, the type of data processing is orthogonal to its capability of conducting POC. Previous proposals, such as TADOC, are not orthogonal POC. This article presents a set of techniques that successfully eliminate the limitation, and for the first time, establishes the near orthogonal POC feasibility of effectively handling both data traversal operations and random data accesses on hierarchically-compressed data. The work focuses on text data and yields a unified high-performance library, called POCLib. In a ten-node distributed Spark cluster on Amazon EC2, POCLib achieves 3.1× speedup over the state-of-the-art on random data accesses to compressed data, while preserving the capability of supporting traversal operations efficiently and providing large (3.9×) space savings.}, number={2}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2022}, month={Feb}, pages={459–475} } @article{ye_xu_shen_jin_liao_solihin_2022, title={Preserving Addressability Upon GC-Triggered Data Movements on Non-Volatile Memory}, volume={19}, ISSN={["1544-3973"]}, DOI={10.1145/3511706}, abstractNote={This article points out an important threat that application-level Garbage Collection (GC) creates to the use of non-volatile memory (NVM). Data movements incurred by GC may invalidate the pointers to objects on NVM and, hence, harm the reusability of persistent data across executions. The article proposes the concept of movement-oblivious addressing (MOA), and develops and compares three novel solutions to materialize the concept for solving the addressability problem. It evaluates the designs on five benchmarks and a real-world application. The results demonstrate the promise of the proposed solutions, especially hardware-supported Multi-Level GPointer, in addressing the problem in a space- and time-efficient manner.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Jin, Hai and Liao, Xiaofei and Solihin, Yan}, year={2022}, month={Jun} } @article{xia_shu_shen_menzies_2022, title={Sequential Model Optimization for Software Effort Estimation}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.3047072}, DOI={10.1109/TSE.2020.3047072}, abstractNote={Many methods have been proposed to estimate how much effort is required to build and maintain software. Much of that research tries to recommend a single method – an approach that makes the dubious assumption that one method can handle the diversity of software project data. To address this drawback, we apply a configuration technique called “ROME” (Rapid Optimizing Methods for Estimation), which uses sequential model-based optimization (SMO) to find what configuration settings of effort estimation techniques work best for a particular data set. We test this method using data from 1161 traditional waterfall projects and 120 contemporary projects (from GitHub). In terms of magnitude of relative error and standardized accuracy, we find that ROME achieves better performance than the state-of-the-art methods for both traditional waterfall and contemporary projects. In addition, we conclude that we should not recommend one method for estimation. Rather, it is better to search through a wide range of different methods to find what works best for the local data. To the best of our knowledge, this is the largest effort estimation experiment yet attempted and the only one to test its methods on traditional waterfall and contemporary projects.}, number={6}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Xia, Tianpei and Shu, Rui and Shen, Xipeng and Menzies, Tim}, year={2022}, month={Jun}, pages={1994–2009} } @article{xu_ye_shen_solihin_2022, title={Temporal Exposure Reduction Protection for Persistent Memory}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA53966.2022.00071}, abstractNote={The long-living nature and byte-addressability of persistent memory (PM) amplifies the importance of strong memory protections. This paper develops temporal exposure reduction protection (TERP) as a framework for enforcing memory safety. Aiming to minimize the time when a PM region is accessible, TERP offers a complementary dimension of memory protection. The paper gives a formal definition of TERP, explores the semantics space of TERP constructs, and the relations with security and composability in both sequential and parallel executions. It proposes programming system and architecture solutions for the key challenges for the adoption of TERP, which draws on novel supports in both compilers and hardware to efficiently meet the exposure time target. Experiments validate the efficacy of the proposed support of TERP, in both efficiency and exposure time minimization.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Xu, Yuanchao and Ye, Chencheng and Shen, Xipeng and Solihin, Yan}, year={2022}, pages={908–924} } @article{sun_xie_shah_shen_2021, title={A Machine Learning Based Ensemble Forecasting Optimization Algorithm for Preseason Prediction of Atlantic Hurricane Activity}, volume={12}, ISSN={["2073-4433"]}, url={https://doi.org/10.3390/atmos12040522}, DOI={10.3390/atmos12040522}, abstractNote={In this study, nine different statistical models are constructed using different combinations of predictors, including models with and without projected predictors. Multiple machine learning (ML) techniques are employed to optimize the ensemble predictions by selecting the top performing ensemble members and determining the weights for each ensemble member. The ML-Optimized Ensemble (ML-OE) forecasts are evaluated against the Simple-Averaging Ensemble (SAE) forecasts. The results show that for the response variables that are predicted with significant skill by individual ensemble members and SAE, such as Atlantic tropical cyclone counts, the performance of SAE is comparable to the best ML-OE results. However, for response variables that are poorly modeled by individual ensemble members, such as Atlantic and Gulf of Mexico major hurricane counts, ML-OE predictions often show higher skill score than individual model forecasts and the SAE predictions. However, neither SAE nor ML-OE was able to improve the forecasts of the response variables when all models show consistent bias. The results also show that increasing the number of ensemble members does not necessarily lead to better ensemble forecasts. The best ensemble forecasts are from the optimally combined subset of models.}, number={4}, journal={ATMOSPHERE}, publisher={MDPI AG}, author={Sun, Xia and Xie, Lian and Shah, Shahil Umeshkumar and Shen, Xipeng}, year={2021}, month={Apr} } @article{guan_shen_krim_2021, title={An Automatic Synthesizer of Advising Tools for High Performance Computing}, volume={32}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2020.3018636}, abstractNote={This article presents Egeria, the first automatic synthesizer of advising tools for High-Performance Computing (HPC). When one provides it with some HPC programming guides as inputs, Egeria automatically constructs a text retrieval tool that can advise on what to do to improve the performance of a given program. The advising tool provides a concise list of essential rules automatically extracted from the documents and can retrieve relevant optimization knowledge for optimization questions. Egeria is built based on a distinctive multi-layered design that leverages natural language processing (NLP) techniques and extends them with HPC-specific knowledge and considerations. This article presents the design, implementation, and both quantitative and qualitative evaluation results of Egeria.}, number={2}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Guan, Hui and Shen, Xipeng and Krim, Hamid}, year={2021}, month={Feb}, pages={330–341} } @article{zhao_niu_yuan_cai_sung_liu_liu_shen_ren_wang_et al._2021, title={Brief Industry Paper: Towards Real-Time 3D Object Detection for Autonomous Vehicles with Pruning Search}, ISSN={["1545-3421"]}, DOI={10.1109/RTAS52030.2021.00043}, abstractNote={In autonomous driving, 3D object detection is es-sential as it provides basic knowledge about the environment. However, as deep learning based 3D detection methods are usually computation intensive, it is challenging to support realtime 3D object detection on edge-computing devices in selfdriving cars with limited computation and memory resources. To facilitate this, we propose a compiler-aware pruning search framework, to achieve real-time inference of 3D object detection on the resource-limited mobile devices. Specifically, a generator is applied to sample better pruning proposals in the search space based on current proposals with their performance, and an evaluator is adopted to evaluate the sampled pruning proposal performance. To accelerate the search, the evaluator employs Bayesian optimization with an ensemble of neural predictors. We demonstrate in experiments that for the first time, the pruning search framework can achieve real-time 3D object detection on mobile (Samsung Galaxy S20 phone) with state-of-the-art detection performance.}, journal={2021 IEEE 27TH REAL-TIME AND EMBEDDED TECHNOLOGY AND APPLICATIONS SYMPOSIUM (RTAS 2021)}, author={Zhao, Pu and Niu, Wei and Yuan, Geng and Cai, Yuxuan and Sung, Hsin-Hsuan and Liu, Shaoshan and Liu, Sijia and Shen, Xipeng and Ren, Bin and Wang, Yanzhi and et al.}, year={2021}, pages={425–428} } @article{guan_liu_ma_niu_ren_shen_wang_zhao_2021, title={CoCoPIE: Enabling Real-Time AI on Off-the-Shelf Mobile Devices via Compression-Compilation Co-Design}, volume={64}, ISSN={["1557-7317"]}, DOI={10.1145/3418297}, abstractNote={A new framework allows intelligence on mainstream end devices without special hardware.}, number={6}, journal={COMMUNICATIONS OF THE ACM}, author={Guan, Hui and Liu, Shaoshan and Ma, Xiaolong and Niu, Wei and Ren, Bin and Shen, Xipeng and Wang, Yanzhi and Zhao, Pu}, year={2021}, month={Jun}, pages={62–68} } @article{shen_zhang_dea_andow_arroyo-fang_gafter_george_grueter_meijer_shivers_et al._2021, title={Coarsening Optimization for Differentiable Programming}, volume={5}, ISSN={["2475-1421"]}, url={https://doi.org/10.1145/3485507}, DOI={10.1145/3485507}, abstractNote={This paper presents a novel optimization for differentiable programming named coarsening optimization. It offers a systematic way to synergize symbolic differentiation and algorithmic differentiation (AD). Through it, the granularity of the computations differentiated by each step in AD can become much larger than a single operation, and hence lead to much reduced runtime computations and data allocations in AD. To circumvent the difficulties that control flow creates to symbolic differentiation in coarsening, this work introduces phi-calculus, a novel method to allow symbolic reasoning and differentiation of computations that involve branches and loops. It further avoids "expression swell" in symbolic differentiation and balance reuse and coarsening through the design of reuse-centric segment of interest identification. Experiments on a collection of real-world applications show that coarsening optimization is effective in speeding up AD, producing several times to two orders of magnitude speedups.}, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, publisher={Association for Computing Machinery (ACM)}, author={Shen, Xipeng and Zhang, Guoqiang and Dea, Irene and Andow, Samantha and Arroyo-Fang, Emilio and Gafter, Neal and George, Johann and Grueter, Melissa and Meijer, Erik and Shivers, Olin Grigsby and et al.}, year={2021}, month={Oct} } @article{zhang_pan_zhou_zhai_shen_mutlu_du_2021, title={G-TADOC: Enabling Efficient GPU-Based Text Analytics without Decompression}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE51399.2021.00148}, abstractNote={Text analytics directly on compression (TADOC) has proven to be a promising technology for big data analytics. GPUs are extremely popular accelerators for data analytics systems. Unfortunately, no work so far shows how to utilize GPUs to accelerate TADOC. We describe G-TADOC, the first framework that provides GPU-based text analytics directly on compression, effectively enabling efficient text analytics on GPUs without decompressing the input data.G-TADOC solves three major challenges. First, TADOC involves a large amount of dependencies, which makes it difficult to exploit massive parallelism on a GPU. We develop a novel fine-grained thread-level workload scheduling strategy for GPU threads, which partitions heavily-dependent loads adaptively in a fine-grained manner. Second, in developing G-TADOC, thousands of GPU threads writing to the same result buffer leads to inconsistency while directly using locks and atomic operations lead to large synchronization overheads. We develop a memory pool with thread-safe data structures on GPUs to handle such difficulties. Third, maintaining the sequence information among words is essential for lossless compression. We design a sequence-support strategy, which maintains high GPU parallelism while ensuring sequence information.Our experimental evaluations show that G-TADOC provides 31.1× average speedup compared to state-of-the-art TADOC.}, journal={2021 IEEE 37TH INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE 2021)}, author={Zhang, Feng and Pan, Zaifeng and Zhou, Yanliang and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2021}, pages={1679–1690} } @article{liao_lin_verma_vanderbruggen_emani_nan_shen_2021, title={HPC Ontology: Towards a Unified Ontology for Managing Training Datasets and AI Models for High-Performance Computing}, DOI={10.1109/MLHPC54614.2021.00012}, abstractNote={Machine learning (ML) techniques have been widely studied to address various challenges of productively and efficiently running large-scale scientific applications on heterogeneous supercomputers. However, it is extremely difficult to generate, access, and maintain training datasets and AI models to accelerate ML-based research. The Future of Research Communications and e-Scholarship has proposed the FAIR data principles describing Findability, Accessibility, Interoperability, and Reusability. In this paper, we present our ongoing work of designing an ontology for high-performance computing (named HPC ontology) in order to make training datasets and AI models FAIR. Our ontology provides controlled vocabularies, explicit semantics, and formal knowledge representations. Our design uses an extensible two-level pattern, capturing both high-level meta information and low-level data content for software, hardware, experiments, workflows, training datasets, AI models, and so on. Preliminary evaluation shows that HPC ontology is effective to annotate selected data and support a set of SPARQL queries.}, journal={PROCEEDINGS OF THE WORKSHOP ON MACHINE LEARNING IN HIGH PERFORMANCE COMPUTING ENVIRONMENTS (MLHPC 2021)}, author={Liao, Chunhua and Lin, Pei-Hung and Verma, Gaurav and Vanderbruggen, Tristan and Emani, Murali and Nan, Zifan and Shen, Xipeng}, year={2021}, pages={69–80} } @article{verma_emani_liao_lin_vanderbruggen_shen_chapman_2021, title={HPCFAIR: Enabling FAIR AI for HPC Applications}, DOI={10.1109/MLHPC54614.2021.00011}, abstractNote={Artificial Intelligence (AI) is being adopted in different domains at an unprecedented scale. A significant interest in the scientific community also involves leveraging machine learning (ML) to effectively run high performance computing applications at scale. Given multiple efforts in this arena, there are often duplicated efforts when existing rich data sets and ML models could be leveraged instead. The primary challenge is a lack of an ecosystem to reuse and reproduce the models and datasets. In this work, we propose HPCFAIR, a modular, extensible framework to enable AI models to be Findable, Accessible, Interoperable and Reproducible (FAIR). It enables users with a structured approach to search, load, save and reuse the models in their codes. We present the design, implementation of our framework and highlight how it can be seamlessly integrated to ML-driven applications for high performance computing applications and scientific machine learning workloads.}, journal={PROCEEDINGS OF THE WORKSHOP ON MACHINE LEARNING IN HIGH PERFORMANCE COMPUTING ENVIRONMENTS (MLHPC 2021)}, author={Verma, Gaurav and Emani, Murali and Liao, Chunhua and Lin, Pei-Hung and Vanderbruggen, Tristan and Shen, Xipeng and Chapman, Barbara}, year={2021}, pages={58–68} } @article{ye_xu_shen_liao_jin_solihin_2021, title={Hardware-Based Address-Centric Acceleration of Key-Value Store}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA51647.2021.00067}, abstractNote={Efficiently retrieving data is essential for key-value store applications. A major part of the retrieving time is on data addressing, that is, finding the location of the value in memory that corresponds to a key. This paper introduces an address-centric approach to speed up the addressing by creating a shortcut for the translation of a key to the physical address of the value. The new technique is materialized with a novel in-memory table, STLT, a virtual-physical address buffer, and two new instructions. It creates a fast path for data addressing and meanwhile opens up opportunities for the use of simpler and faster hash tables to strike a better tradeoff between hashing conflicts and hashing overhead. Together, the new technique brings up to 1.4× speedups on key-value store application Redis and up to 13× speedups on some widely used indexing data structures, consistently outperforming prior solutions significantly.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2021}, pages={736–748} } @article{guan_chaudhary_xu_ning_zhang_shen_2021, title={Recurrent Neural Networks Meet Context-Free Grammar: Two Birds with One Stone}, ISSN={["1550-4786"]}, DOI={10.1109/ICDM51629.2021.00125}, abstractNote={Recurrent Neural Networks (RNN) are widely used for various prediction tasks on sequences such as text, speed signals, program traces, and system logs. Due to RNNs’ inherently sequential behavior, one key challenge for the effective adoption of RNNs is to reduce the time spent on RNN inference and to increase the scope of a prediction. This work introduces CFG-guided compressed learning, an approach that creatively integrates Context-Free Grammar (CFG) and online tokenization into RNN learning and inference for streaming inputs. Through a hierarchical compression algorithm, it compresses an input sequence to a CFG and makes predictions based on the compressed sequence. Its algorithm design employs a set of techniques to overcome the issues from the myopic nature of online tokenization, the tension between inference accuracy and compression rate, and other complexities. Experiments on 16 real-world sequences of various types validate that the proposed compressed learning can successfully recognize and leverage repetitive patterns in input sequences, and effectively translate them into dramatic (1-1762×) inference speedups as well as much (1-7830×) expanded prediction scope, while keeping the inference accuracy satisfactory.}, journal={2021 21ST IEEE INTERNATIONAL CONFERENCE ON DATA MINING (ICDM 2021)}, author={Guan, Hui and Chaudhary, Umang and Xu, Yuanchao and Ning, Lin and Zhang, Lijun and Shen, Xipeng}, year={2021}, pages={1078–1083} } @article{zhang_guan_ding_shen_krim_2021, title={Reuse-centric k-means configuration}, volume={100}, ISSN={["1873-6076"]}, url={https://doi.org/10.1016/j.is.2021.101787}, DOI={10.1016/j.is.2021.101787}, abstractNote={K-means configuration is to find a configuration of k-means (e.g., the number of clusters, feature sets) that maximize some objectives. It is a time-consuming process due to the iterative nature of k-means. This paper proposes reuse-centric k-means configuration to accelerate k-means configuration. It is based on the observation that the explorations of different configurations share lots of common or similar computations. Effectively reusing the computations from prior trials of different configurations could largely shorten the configuration time. To materialize the idea, the paper presents a set of novel techniques, including reuse-based filtering, center reuse, and a two-phase design to capitalize on the reuse opportunities on three levels: validation, number of clusters, and feature sets. Experiments on k-means–based data classification tasks show that reuse-centric k-means configuration can speed up a heuristic search-based configuration process by a factor of 5.8, and a uniform search-based attainment of classification error surfaces by a factor of 9.1. The paper meanwhile provides some important insights on how to effectively apply the acceleration techniques to tap into a full potential.}, journal={INFORMATION SYSTEMS}, author={Zhang, Lijun and Guan, Hui and Ding, Yufei and Shen, Xipeng and Krim, Hamid}, year={2021}, month={Sep} } @article{yang_shen_lim_2021, title={Revisit the Scalability of Deep Auto-Regressive Models for Graph Generation}, ISSN={["2161-4393"]}, DOI={10.1109/IJCNN52387.2021.9534206}, abstractNote={As a new promising approach to graph generations, deep auto-regressive graph generation has drawn increasing attention. It however has been commonly deemed as hard to scale up to work with large graphs. In existing studies, it is perceived that the consideration of the full non-local graph dependences is indispensable for this approach to work, which entails the needs for keeping the entire graph's info in memory and hence the perceived “inherent” scalability limitation of the approach. This paper revisits the common perception. It proposes three ways to relax the dependences and conducts a series of empirical measurements. It concludes that the perceived “inherent” scalability limitation is a misperception; with the right design and implementation, deep auto-regressive graph generation can be applied to graphs much larger than the device memory. The rectified perception removes a fundamental barrier for this approach to meet practical needs.}, journal={2021 INTERNATIONAL JOINT CONFERENCE ON NEURAL NETWORKS (IJCNN)}, author={Yang, Shuai and Shen, Xipeng and Lim, Seung-Hwan}, year={2021} } @article{ul mustafa_xu_shen_solihin_2021, title={Seeds of SEED: New Security Challenges for Persistent Memory}, DOI={10.1109/SEED51797.2021.00020}, abstractNote={Persistent Memeory Object (PMO) is a general system abstraction for holding persistent data in persistent main memory, managed by an operating system. PMO programming model breaks inter-process isolation as it results in sharing of persistent data between two processes as they alternatively access the same PMO. The uncoordinated data-access opens a new avenue for cross-run and cross-process security attacks.In this paper, we discuss threat vulnerabilities that are either new or increased in intensity under PMO programming model. We also discuss security implications of using the PMO, highlighting sample PMO-based attacks and potential strategies to defend against them.}, journal={2021 INTERNATIONAL SYMPOSIUM ON SECURE AND PRIVATE EXECUTION ENVIRONMENT DESIGN (SEED 2021)}, author={Ul Mustafa, Naveed and Xu, Yuanchao and Shen, Xipeng and Solihin, Yan}, year={2021}, pages={83–88} } @article{agrawal_yang_agrawal_yedida_shen_menzies_2021, title={Simpler Hyperparameter Optimization for Software Analytics: Why, How, When}, volume={48}, ISSN={0098-5589 1939-3520 2326-3881}, url={http://dx.doi.org/10.1109/TSE.2021.3073242}, DOI={10.1109/TSE.2021.3073242}, abstractNote={How can we make software analytics simpler and faster? One method is to match the complexity of analysis to the intrinsic complexity of the data being explored. For example, hyperparameter optimizers find the control settings for data miners that improve the predictions generated via software analytics. Sometimes, very fast hyperparameter optimization can be achieved by “DODGE-ing”; i.e., simply steering way from settings that lead to similar conclusions. But when is it wise to use that simple approach and when must we use more complex (and much slower) optimizers? To answer this, we applied hyperparameter optimization to 120 SE data sets that explored bad smell detection, predicting Github issue close time, bug report analysis, defect prediction, and dozens of other non-SE problems. We find that the simple DODGE works best for data sets with low “intrinsic dimensionality” ($\mu _D\approx 3$μD3) and very poorly for higher-dimensional data ($\mu _D > 8$μD>8). Nearly all the SE data seen here was intrinsically low-dimensional, indicating that DODGE is applicable for many SE analytics tasks.}, number={8}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Agrawal, Amritanshu and Yang, Xueqi and Agrawal, Rishabh and Yedida, Rahul and Shen, Xipeng and Menzies, Tim}, year={2021}, pages={1–1} } @article{ye_xu_shen_liao_jin_solihin_2021, title={Supporting Legacy Libraries on Non-Volatile Memory: A User-Transparent Approach}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA52012.2021.00042}, abstractNote={As mainstream computing is poised to embrace the advent of byte-addressable non-volatile memory (NVM), an important roadblock has remained largely unnoticed, support of legacy libraries on NVM. Libraries underpin modern software everywhere. As current NVM programming interfaces all designate special types and constructs for NVM objects and references, legacy libraries, being incompatible with these data types, will face major obstacles for working with future applications written for NVM. This paper introduces a simple approach to mitigating the issue. The novel approach centers around user-transparent persistent reference, a new concept that allows programmers to reference a persistent object in the same way as reference a normal (volatile) object. The paper presents the implementation of the concept, carefully examines its soundness, and describes compiler and simple architecture support for keeping performance overheads very low.}, journal={2021 ACM/IEEE 48TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2021)}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2021}, pages={443–455} } @article{zhang_zhai_shen_wang_chen_mutlu_chen_du_2021, title={TADOC: Text analytics directly on compression}, volume={30}, ISSN={["0949-877X"]}, DOI={10.1007/s00778-020-00636-3}, abstractNote={This article provides a comprehensive description of Text Analytics Directly on Compression (TADOC), which enables direct document analytics on compressed textual data. The article explains the concept of TADOC and the challenges to its effective realizations. Additionally, a series of guidelines and technical solutions that effectively address those challenges, including the adoption of a hierarchical compression method and a set of novel algorithms and data structure designs, are presented. Experiments on six data analytics tasks of various complexities show that TADOC can save 90.8% storage space and 87.9% memory usage, while halving data processing times.}, number={2}, journal={VLDB JOURNAL}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Wang, Dalin and Chen, Zheng and Mutlu, Onur and Chen, Wenguang and Du, Xiaoyong}, year={2021}, month={Mar}, pages={163–188} } @article{tan_chen_liu_ren_song_shen_liu_2021, title={Toward Efficient Interactions between Python and Native Libraries}, DOI={10.1145/3468264.3468541}, abstractNote={Python has become a popular programming language because of its excellent programmability. Many modern software packages utilize Python for high-level algorithm design and depend on native libraries written in C/C++/Fortran for efficient computation kernels. Interaction between Python code and native libraries introduces performance losses because of the abstraction lying on the boundary of Python and native libraries. On the one side, Python code, typically run with interpretation, is disjoint from its execution behavior. On the other side, native libraries do not include program semantics to understand algorithm defects. To understand the interaction inefficiencies, we extensively study a large collection of Python software packages and categorize them according to the root causes of inefficiencies. We extract two inefficiency patterns that are common in interaction inefficiencies. Based on these patterns, we develop PieProf, a lightweight profiler, to pinpoint interaction inefficiencies in Python applications. The principle of PieProf is to measure the inefficiencies in the native execution and associate inefficiencies with high-level Python code to provide a holistic view. Guided by PieProf, we optimize 17 real-world applications, yielding speedups up to 6.3× on application level.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Tan, Jialiang and Chen, Yu and Liu, Zhenming and Ren, Bin and Song, Shuaiwen Leon and Shen, Xipeng and Liu, Xu}, year={2021}, pages={1117–1128} } @article{zhang_xu_shen_dillig_2021, title={UDF to SQL Translation through Compositional Lazy Inductive Synthesis}, volume={5}, ISSN={["2475-1421"]}, url={https://doi.org/10.1145/3485489}, DOI={10.1145/3485489}, abstractNote={ Many data processing systems allow SQL queries that call user-defined functions (UDFs) written in conventional programming languages. While such SQL extensions provide convenience and flexibility to users, queries involving UDFs are not as efficient as their pure SQL counterparts that invoke SQL’s highly-optimized built-in functions. Motivated by this problem, we propose a new technique for translating SQL queries with UDFs to pure SQL expressions. Unlike prior work in this space, our method is not based on syntactic rewrite rules and can handle a much more general class of UDFs. At a high-level, our method is based on counterexample-guided inductive synthesis (CEGIS) but employs a novel compositional strategy that decomposes the synthesis task into simpler sub-problems. However, because there is no universal decomposition strategy that works for all UDFs, we propose a novel lazy inductive synthesis approach that generates a sequence of decompositions that correspond to increasingly harder inductive synthesis problems. Because most realistic UDF-to-SQL translation tasks are amenable to a fine-grained decomposition strategy, our lazy inductive synthesis method scales significantly better than traditional CEGIS. }, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, publisher={Association for Computing Machinery (ACM)}, author={Zhang, Guoqiang and Xu, Yuanchao and Shen, Xipeng and Dillig, Isil}, year={2021}, month={Oct} } @article{li_zhang_shen_2020, title={DIAC An Inter-app Conflicts Detector for Open IoT Systems}, volume={19}, ISSN={["1558-3465"]}, DOI={10.1145/3391895}, abstractNote={This article tackles the problem of detecting and solving potential conflicts among independently developed apps that are to be installed into an open Internet-of-Things (IoT) environment. It provides a new set of definitions and categorizations of the conflicts to more precisely characterize the nature of the problem, and it proposes a representation named “IA Graphs” for formally representing IoT controls and inter-app interplays. Based on the definitions and representation, it then describes an efficient conflict detection algorithm. Combining conflict categories, seriousness indicator, and conflict frequency, an innovative solution policy for solving various detected conflicts is developed, which also takes into account user preference and interest by providing interactive process. It implements a compiler and runtime software system that integrates all the proposed techniques together into a comprehensive solution. Experiments on SmartThings apps validate its significantly better detection efficacy over prior methods and effectiveness of conflict solution with user preference.}, number={6}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Li, Xinyi and Zhang, Lei and Shen, Xipeng}, year={2020}, month={Nov} } @inproceedings{zhang_zhai_shen_mutlu_du_2020, title={Enabling Efficient Random Access to Hierarchically-Compressed Data}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE48307.2020.00097}, abstractNote={Recent studies have shown the promise of direct data processing on hierarchically-compressed text documents. By removing the need for decompressing data, the direct data processing technique brings large savings in both time and space. However, its benefits have been limited to data traversal operations; for random accesses, direct data processing is several times slower than the state-of-the-art baselines. This paper presents a set of techniques that successfully eliminate the limitation, and for the first time, establishes the feasibility of effectively handling both data traversal operations and random data accesses on hierarchically-compressed data. The work yields a new library, which achieves 3.1 × speedup over the state-of-the-art on random data accesses to compressed data, while preserving the capability of supporting traversal operations efficiently and providing large (3.9 ×) space savings.}, booktitle={2020 IEEE 36th International Conference on Data Engineering (ICDE)}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2020}, pages={1069–1080} } @article{zhou_zhao_shen_chen_2020, title={Enabling Runtime SpMV Format Selection through an Overhead Conscious Method}, volume={31}, DOI={10.1109/TPDS.2019.2932931}, abstractNote={Sparse matrix-vector multiplication (SpMV) is an important kernel and its performance is critical for many applications. Storage format selection is to select the best format to store a sparse matrix; it is essential for SpMV performance. Prior studies have focused on predicting the format that helps SpMV run fastest, but have ignored the runtime prediction and format conversion overhead. This work shows that the runtime overhead makes the predictions from previous solutions frequently sub-optimal and sometimes inferior regarding the end-to-end time. It proposes a new paradigm for SpMV storage selection, an overhead-conscious method. Through carefully designed regression models and neural network-based time series prediction models, the method captures the influence imposed on the overall program performance by the overhead and the benefits of format prediction and conversions. The method employs a novel two-stage lazy-and-light scheme to help control the possible negative effects of format predictions, and at the same time, maximize the overall format conversion benefits. Experiments show that the technique outperforms previous techniques significantly. It improves the overall performance of applications by 1.21X to 1.53X, significantly larger than the 0.83X to 1.25X upper-bound speedups overhead-oblivious methods could give.}, number={1}, journal={IEEE Transactions on Parallel and Distributed Systems}, author={Zhou, Weijie and Zhao, Yue and Shen, Xipeng and Chen, Wang}, year={2020}, month={Jan}, pages={80–93} } @article{oh_zheng_shen_zhai_yi_2020, title={GOPipe: A Granularity-Oblivious Programming Framework for Pipelined Stencil Executions on GPU}, ISSN={["1089-795X"]}, DOI={10.1145/3410463.3414656}, abstractNote={Recent studies have shown promising performance benefits when multiple stages of a pipelined stencil application are mapped to different parts of a GPU to run concurrently. An important factor for the computing efficiency of such pipelines is the granularity of a task. In previous programming frameworks that support true pipelined computations on GPU, the choice has to be made by the programmers during the application development time. Due to many difficulties, programmers' decisions are often far from optimal, causing inferior performance and performance portability. This paper presents GOPipe, a granularity-oblivious programming framework for efficient pipelined stencil executions on GPU. With GOPipe, programmers no longer need to specify the appropriate task granularity. GOPipe automatically finds it, and dynamically schedules tasks of that granularity for efficiency while observing all inter-task and inter-stage data dependencies. In our experiments on six real-life applications and various scenarios, GOPipe outperforms the state-of-the-art system by 1.39X on average with a much better programming productivity.}, journal={PACT '20: PROCEEDINGS OF THE ACM INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES}, author={Oh, Chanyoung and Zheng, Zhen and Shen, Xipeng and Zhai, Jidong and Yi, Youngmin}, year={2020}, pages={43–54} } @article{zhou_zhao_zhang_shen_2020, title={HARP: Holistic Analysis for Refactoring Python-Based Analytics Programs}, ISSN={["0270-5257"]}, DOI={10.1145/3377811.3380434}, abstractNote={Modern machine learning programs are often written in Python, with the main computations specified through calls to some highly optimized libraries (e.g., TensorFlow, PyTorch). How to maximize the computing efficiency of such programs is essential for many application domains, which has drawn lots of recent attention. This work points out a common limitation in existing efforts: they focus their views only on the static computation graphs specified by library APIs, but leave the influence from the hosting Python code largely unconsidered. The limitation often causes them to miss the big picture and hence many important optimization opportunities. This work proposes a new approach named HARP to address the problem. HARP enables holistic analysis that spans across computation graphs and their hosting Python code. HARP achieves it through a set of novel techniques: analytics-conscious speculative analysis to circumvent Python complexities, a unified representation augmented computation graphs to capture all dimensions of knowledge related with the holistic analysis, and conditioned feedback mechanism to allow risk-controlled aggressive analysis. Refactoring based on HARP gives 1.3-3X and 2.07X average speedups on a set of TensorFlow and PyTorch programs.}, journal={2020 ACM/IEEE 42ND INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2020)}, author={Zhou, Weijie and Zhao, Yue and Zhang, Guoqiang and Shen, Xipeng}, year={2020}, pages={506–517} } @article{xu_ye_solihin_shen_2020, title={Hardware-Based Domain Virtualization for Intra-Process Isolation of Persistent Memory Objects}, ISSN={["0884-7495"]}, DOI={10.1109/ISCA45697.2020.00062}, abstractNote={Persistent memory has appealing properties in serving as main memory. While file access is protected by system calls, an attached persistent memory object (PMO) is one load/store away from accidental (or malicious) reads or writes, which may arise from use of just one buggy library. The recent progress in intra-process isolation could potentially protect PMO by enabling a process to partition sensitive data and code into isolated components. However, the existing intra-process isolations (e.g., Intel MPK) support isolation of only up to 16 domains, forming a major barrier for PMO protections. Although there is some recent effort trying to virtualize MPK to circumvent the limit, it suffers large overhead. This paper presents two novel architecture supports, which provide 11 - 52 × higher efficiency while offering the first known domain-based protection for PMOs.}, journal={2020 ACM/IEEE 47TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2020)}, author={Xu, Yuanchao and Ye, ChenCheng and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={680–692} } @article{xu_solihin_shen_2020, title={MERR: Improving Security of Persistent Memory Objects via Efficient Memory Exposure Reduction and Randomization}, DOI={10.1145/3373376.3378492}, abstractNote={This paper proposes a new defensive technique for memory, especially useful for long-living objects on Non-Volatile Memory (NVM), or called Persistent Memory objects (PMOs). The method takes a distinctive perspective, trying to reduce memory exposure time by largely shortening the overhead in attaching and detaching PMOs into the memory space. It does it through a novel idea, embedding page table subtrees inside PMOs. The paper discusses the complexities the technique brings, to permission controls and hardware implementations, and provides solutions. Experimental results show that the new technique reduces memory exposure time by 60% with a 5% time overhead (70% with 10.9% overhead). It allows much more frequent address randomizations (shortening the period from seconds to less than 41.4us), offering significant potential for enhancing memory security.}, journal={TWENTY-FIFTH INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS (ASPLOS XXV)}, author={Xu, Yuanchao and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={987–1000} } @article{jin_shen_lovas_liao_2020, title={Special Issue: Graph Computing}, volume={32}, ISSN={["1532-0634"]}, DOI={10.1002/cpe.5452}, abstractNote={Graph computing now is popular in many areas, including social network and gene sequence alignment. Graph computing system and algorithm have a history prior to the use of graph databases and have a future that is not necessarily entangled with typical database concerns. With the data's increasing size, many distributed graph-computing systems have been developed in recent years to process and analyze massive graphs. Researchers pay more attention on the graph partition schemes on distributed environment. However, other researchers think a single system can avoid the network overhead and may have better performance even if the data size is too big for the memory space. With the rapid development of coprocessors, some researchers think it is promising to build a domain specific computer, just for graph computing. The proposed special issue of Concurrency and Computation: Practice and Experience contains revised and extended versions of selected best papers with respect to graph computing at the 21st IEEE International Conference on Parallel and Distributed Systems (ICPADS’16), which was held at Wuhan, China, on December 13-16, 2016. Established in 1992, ICPADS has been a major international forum for scientists, engineers, and users to exchange and share their experiences, new ideas, and latest research results on all aspects of parallel and distributed computing systems. The purpose of this special issue is to provide a comprehensive view into recent advances in systems software, algorithms, partition schemes, and even graph computer based on new advances in computer architecture and applications. The five selected papers are summarized as follows. The first paper, titled ‘‘An efficient iterative graph data processing framework based on bulk synchronous parallel model’’ by Liu et al,1 presents an efficient computational framework for graph data processing based on the bulk synchronous parallel model. Existing Pregel-like graph processing systems remains in its early stage, and there still exist many challenges with prohibitive superstep-synchronized overhead. Furthermore, the graph data partition strategy in these earlier graph systems fails to support load balancing, therefore causing the increase of network I/O overhead as the scale of graph data grows. Thus, this paper leverages a global synchronization mechanism to enhance the performance of graph computation. Meanwhile, a balanced hash-based graph partition mechanism is presented to optimize the large-scale graph data processing. The work has a real implementation upon on Pregrel system, which can better support a variety of graph analytics applications. The second paper, titled ‘‘An efficient iterative graph data processing framework based on bulk synchronous parallel model’’ by Linchen Yu,2 proposes an optimized scheduling system for parallelizing the programs in the Xen. Virtualization challenges the traditional CPU scheduling, leading that the spin lock in virtualized environment can be preempted by the VMM, increasing synchronization overhead and decreasing the performance of parallel programs. Many studies have proposed the co-scheduling to alleviate this problem. However, these earlier attempts are not suitable to non-parallel workloads with the CPU fragmentation problem as well. Therefore, a simultaneous optimization scheduling system, called CCHybrid, is proposed in the Xen virtualized environment. Results show the efficiency of CCHybrid over the traditional Xen Credit scheduler. The third paper, titled ‘‘ms-PoSW: A multi-server aided proof of shared ownership scheme for secure deduplication in cloud’’ by Xiong et al,3 introduces a novel concept of the Proof for securing client-side deduplication of the shared files. With the rapid development of cloud computing and big data technologies, collaborative cloud applications are inextricably linked to our daily life and, therefore, produce a large number of shared files, which is challenging for secure access and data duplication in cloud. This paper proposes a novel multiserver-aided PoSW scheme for collaborative cloud applications and propose a hybrid PoSW scheme to reduce the computational cost of the shared owner's client. Furthermore, a hybrid PoSW scheme is constructed to address the secure proof of hybrid cloud architectures. The fourth paper, titled ‘‘Sparse random compressive sensing based data aggregation in wireless sensor networks’’ by Yin et al,4 introduces a compressive data aggregation scheme. In wireless sensor networks, the increasingly expanding data volume has high spatial-temporal correlation. Although some earlier studies attempt to eliminate data redundancy, few can handle energy consumption and latency simultaneously. In this paper, the authors a delay-minimum energy-balanced data aggregation method, which can eliminate the redundancy among the readings and prolong the network lifetime. A sparse random matrix is adopted as a measurement matrix to balance communication cost. Particularly, each measurement can form an aggregation tree with minimum delay. Furthermore, a novel scheduling method is used to avoid information interference as well. The fifth paper, titled ‘‘Dynamic cluster strategy for hierarchical rollback-recovery protocols in MPI HPC applications’’ by Liao et al,5 proposes a dynamic cluster strategy to adapt to the runtime variation of communication pattern by using a prediction scheme. The idea comes from a fact that Hierarchical rollback-recovery protocols provide failure containment and reduce the amount of message to be logged, making it an attractive and scalable solution for fault tolerance even at a large scale. This paper shows how the communication pattern changes with the stages}, number={3}, journal={CONCURRENCY AND COMPUTATION-PRACTICE & EXPERIENCE}, author={Jin, Hai and Shen, Xipeng and Lovas, Robert and Liao, Xiaofei}, year={2020}, month={Feb} } @article{ning_guan_shen_2019, title={Adaptive Deep Reuse: Accelerating CNN Training on the Fly}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE.2019.00138}, abstractNote={This work proposes adaptive deep reuse, a method for accelerating CNN training by identifying and avoiding the unnecessary computations contained in each specific training on the fly. It makes two-fold major contributions. (1) It empirically proves the existence of a lot of similarities among neuron vectors in both forward and backward propagation of CNN. (2) It introduces the first adaptive strategy for translating the similarities into computation reuse in CNN training. The strategy adaptively adjusts the strength of reuse based on the different tolerance of precision relaxation in different CNN training stages. Experiments show that adaptive deep reuse saves 69% CNN training time with no accuracy loss.}, journal={2019 IEEE 35TH INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE 2019)}, author={Ning, Lin and Guan, Hui and Shen, Xipeng}, year={2019}, pages={1538–1549} } @inproceedings{ning_shen_2019, title={Deep reuse}, ISBN={9781450360791}, url={http://dx.doi.org/10.1145/3330345.3330384}, DOI={10.1145/3330345.3330384}, abstractNote={This paper presents deep reuse, a method for speeding up CNN inferences by detecting and exploiting deep reusable computations on the fly. It empirically reveals the massive similarities among neuron vectors in activation maps, both within CNN inferences on an input and across inputs. It gives an in-depth study on how to effectively turn the similarities into beneficial computation reuse to speed up CNN inferences. The investigation covers various factors, ranging from the clustering methods for similarity detection, to clustering scopes, similarity metrics, and neuron vector granularities. The insights help create deep reuse. As an on-line method, deep reuse is easy to apply, and adapts to each CNN (compressed or not) and its input. Using no special hardware support or CNN model changes, this method speeds up inferences by 1.77--2X (up to 4.3X layer-wise) on the fly with virtually no (}, booktitle={Proceedings of the ACM International Conference on Supercomputing - ICS '19}, publisher={ACM Press}, author={Ning, Lin and Shen, Xipeng}, year={2019} } @inproceedings{zheng_oh_zhai_shen_yi_chen_2019, title={HiWayLib}, ISBN={9781450362405}, url={http://dx.doi.org/10.1145/3297858.3304032}, DOI={10.1145/3297858.3304032}, abstractNote={Pipeline is a parallel computing model underpinning a class of important applications running on CPU-GPU heterogeneous systems. A critical aspect for the efficiency of such applications is the support of communications among pipeline stages that may reside on CPU and different parts of a GPU. Existing libraries of concurrent data structures do not meet the needs, due to the massive parallelism on GPU and the complexities in CPU-GPU memory and connections. This work gives an in-depth study on the communication problem. It identifies three key issues, namely, slow and error-prone detection of the end of pipeline processing, intensive queue contentions on GPU, and cumbersome inter-device data movements. This work offers solutions to each of the issues, and integrates all together to form a unified library named HiWayLib. Experiments show that HiWayLib significantly boosts the efficiency of pipeline communications in CPU-GPU heterogeneous applications. For real-world applications, HiWayLib produces 1.22~2.13× speedups over the state-of-art implementations with little extra programming effort required.}, booktitle={Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems - ASPLOS '19}, publisher={ACM Press}, author={Zheng, Zhen and Oh, Chanyoung and Zhai, Jidong and Shen, Xipeng and Yi, Youngmin and Chen, Wenguang}, year={2019} } @article{agrawal_fu_chen_shen_menzies_2019, title={How to "DODGE" Complex Software Analytics}, volume={47}, ISSN={0098-5589 1939-3520 2326-3881}, url={http://dx.doi.org/10.1109/tse.2019.2945020}, DOI={10.1109/TSE.2019.2945020}, abstractNote={Machine learning techniques applied to software engineering tasks can be improved by hyperparameter optimization, i.e., automatic tools that find good settings for a learner's control parameters. We show that such hyperparameter optimization can be unnecessarily slow, particularly when the optimizers waste time exploring “redundant tunings”, i.e., pairs of tunings which lead to indistinguishable results. By ignoring redundant tunings, DODGE($\mathcal {E})$E), a tuning tool, runs orders of magnitude faster, while also generating learners with more accurate predictions than seen in prior state-of-the-art approaches.}, number={10}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Agrawal, Amritanshu and Fu, Wei and Chen, Di and Shen, Xipeng and Menzies, Tim}, year={2019}, pages={1–1} } @inproceedings{li_zhang_shen_2019, title={IA-graph based inter-app conflicts detection in open IoT systems}, ISBN={9781450367240}, url={http://dx.doi.org/10.1145/3316482.3326350}, DOI={10.1145/3316482.3326350}, abstractNote={This paper tackles the problem of detecting potential conflicts among independently developed apps that are to be installed into an open Internet of Things (IoT) environment. It provides a new set of definitions and categorizations of the conflicts to more precisely characterize the nature of the problem, and employs a graph representation (named IA Graph) for formally representing IoT controls and inter-app interplays. It provides an efficient conflicts detection algorithm implemented on a SmartThings compiler and shows significantly improved efficacy over prior solutions.}, booktitle={Proceedings of the 20th ACM SIGPLAN/SIGBED International Conference on Languages, Compilers, and Tools for Embedded Systems - LCTES 2019}, publisher={ACM Press}, author={Li, Xinyi and Zhang, Lei and Shen, Xipeng}, year={2019} } @inproceedings{guan_ning_lin_shen_zhou_lim_2019, title={In-Place Zero-Space Memory Protection for CNN}, booktitle={Advances in Neural Information Processing Systems Proceedings}, author={Guan, Hui and Ning, Lin and Lin, Zhen and Shen, Xipeng and Zhou, Huiyang and Lim, Seung-Hwan}, editor={Wallach, H. and Larochelle, H. and Beygelzimer, A. and d'Alché-Buc, F. and Fox, E. and Garnett, R.Editors}, year={2019} } @article{oh_zheng_shen_zhai_yi_2019, title={POSTER: GOPipe: A Granularity-Oblivious Programming Framework for Pipelined Stencil Executions on GPU}, DOI={10.1145/3293883.3301494}, abstractNote={Recent studies have shown promising performance benefits of pipelined stencil applications. An important factor for the computing efficiency of such pipelines is the granularity of a task. We presents GOPipe, the first granularity-oblivious programming framework for efficient pipelined stencil executions. With GOPipe, programmers no longer need to specify the appropriate task granularity. GOPipe automatically finds it, and schedules tasks of that granularity while observing all inter-task and inter-stage data dependencies. In our experiments on four real-life applications, GOPipe outperforms the state-of-the-art by up to 4.57× with a much better programming productivity.}, journal={PROCEEDINGS OF THE 24TH SYMPOSIUM ON PRINCIPLES AND PRACTICE OF PARALLEL PROGRAMMING (PPOPP '19)}, author={Oh, Chanyoung and Zheng, Zhen and Shen, Xipeng and Zhai, Jidong and Yi, Youngmin}, year={2019}, pages={431–432} } @inproceedings{yang_shen_chi_2019, title={Streamline Density Peak Clustering for Practical Adoptions}, ISBN={9781450369763}, url={http://dx.doi.org/10.1145/3357384.3358053}, DOI={10.1145/3357384.3358053}, abstractNote={Since Density Peak Clustering (DPC) algorithm was proposed in 2014, it has drawn lots of interest in various domains. As a clustering method, DPC features superior generality, robustness, flexibility and simplicity. There are however two main roadblocks for its practical adoptions, both centered around the selection of cutoff distance, the single critical hyperparameter of DPC. This work proposes an improved algorithm named Streamlined Density Peak Clustering (SDPC). SDPC speeds up DPC executions on a sequence of cutoff distances by 2.2-8.8X while at the same time reducing memory usage by a magnitude. As an algorithm preserving the original semantic of DPC, SDPC offers an efficient and scalable drop-in replacement of DPC for data clustering.}, booktitle={Proceedings of the 28th ACM International Conference on Information and Knowledge Management - CIKM '19}, publisher={ACM Press}, author={Yang, Shuai and Shen, Xipeng and Chi, Min}, year={2019} } @inproceedings{guan_shen_lim_2019, title={Wootz: a compiler-based framework for fast CNN pruning via composability}, ISBN={9781450367127}, url={http://dx.doi.org/10.1145/3314221.3314652}, DOI={10.1145/3314221.3314652}, abstractNote={Convolutional Neural Networks (CNN) are widely used for Deep Learning tasks. CNN pruning is an important method to adapt a large CNN model trained on general datasets to fit a more specialized task or a smaller device. The key challenge is on deciding which filters to remove in order to maximize the quality of the pruned networks while satisfying the constraints. It is time-consuming due to the enormous configuration space and the slowness of CNN training. The problem has drawn many efforts from the machine learning field, which try to reduce the set of network configurations to explore. This work tackles the problem distinctively from a programming systems perspective, trying to speed up the evaluations of the remaining configurations through computation reuse via a compiler-based framework. We empirically uncover the existence of composability in the training of a collection of pruned CNN models, and point out the opportunities for computation reuse. We then propose composability-based CNN pruning, and design a compression-based algorithm to efficiently identify the set of CNN layers to pre-train for maximizing their reuse benefits in CNN pruning. We further develop a compiler-based framework named Wootz, which, for an arbitrary CNN, automatically generates code that builds a Teacher-Student scheme to materialize composability-based pruning. Experiments show that network pruning enabled by Wootz shortens the state-of-art pruning process by up to 186X while producing significantly better pruning results.}, booktitle={Proceedings of the 40th ACM SIGPLAN Conference on Programming Language Design and Implementation - PLDI 2019}, publisher={ACM Press}, author={Guan, Hui and Shen, Xipeng and Lim, Seung-Hwan}, year={2019} } @inproceedings{zhao_li_liao_shen_2018, title={Bridging the Gap between Deep Learning and Sparse Matrix Format Selection}, volume={53}, DOI={10.1145/3178487.3178495}, abstractNote={This work presents a systematic exploration on the promise and special challenges of deep learning for sparse matrix format selection---a problem of determining the best storage format for a matrix to maximize the performance of Sparse Matrix Vector Multiplication (SpMV). It describes how to effectively bridge the gap between deep learning and the special needs of the pillar HPC problem through a set of techniques on matrix representations, deep learning structure, and cross-architecture model migrations. The new solution cuts format selection errors by two thirds, and improves SpMV performance by 1.73X on average over the state of the art.}, number={1}, booktitle={ACM SIGPLAN NOTICES}, author={Zhao, Yue and Li, Jiajia and Liao, Chunhua and Shen, Xipeng}, year={2018}, pages={94–108} } @article{shen_lovas_liao_2018, title={Editorial for the Special Issue on In-Memory Computing}, volume={120}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2018.05.009}, abstractNote={In this paper, we consider the uniform deployment problem of mobile agents in asynchronous unidirectional rings, which requires the agents to uniformly spread in the ring. The uniform deployment problem is in striking contrast to the rendezvous problem which requires the agents to meet at the same node. While rendezvous aims to break the symmetry, uniform deployment aims to attain the symmetry. It is well known that the symmetry breaking is difficult in distributed systems and the rendezvous problem cannot be solved from some initial configurations. Hence, we are interested in clarifying what difference the uniform deployment problem has on the solvability and the number of agent moves compared to the rendezvous problem. We consider two problem settings, with knowledge of k (or n) and without knowledge of k or n where k is the number of agents and n is the number of nodes. First, we consider agents with knowledge of k (or n since k andn can be easily obtained if one of them is given). In this case, we propose two algorithms. The first algorithm solves the uniform deployment problem with termination detection. This algorithm requires O(klogn) memory space per agent, O(n) time, and O(kn) total moves. The second algorithm also solves the uniform deployment problem with termination detection. This algorithm reduces the memory space per agent to O(logn), but uses O(nlogk) time, and requires O(kn) total moves. Both algorithms are asymptotically optimal in terms of total moves since there are some initial configurations such that agents require Ω(kn) total moves to solve the problem. Next, we consider agents with no knowledge of k or n. In this case, we show that, when termination detection is required, there exists no algorithm to solve the uniform deployment problem. For this reason, we consider the relaxed uniform deployment problem that does not require termination detection, and we propose an algorithm to solve the relaxed uniform deployment problem. This algorithm requires O((k∕l)log(n∕l)) memory space per agent, O(n∕l) time, and O(kn∕l) total moves when the initial configuration has symmetry degree l. This means that the algorithm can solve the problem more efficiently when the initial configuration has higher symmetric degree (i.e., is closer to uniform deployment). Note that all the proposed algorithms achieve uniform deployment from any initial configuration, which is a striking difference from the rendezvous problem because the rendezvous problem is not solvable from some initial configurations.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Shen, Xipeng and Lovas, Robert and Liao, Xiaofei}, year={2018}, month={Oct}, pages={322–322} } @article{zhang_zhai_shen_mutlu_chen_2018, title={Efficient Document Analytics on Compressed Data: Method, Challenges, Algorithms, Insights}, volume={11}, ISSN={["2150-8097"]}, DOI={10.14778/3236187.3236203}, abstractNote={ Today's rapidly growing document volumes pose pressing challenges to modern document analytics, in both space usage and processing time. In this work, we propose the concept of compression-based direct processing to alleviate issues in both dimensions. The main idea is to enable direct document analytics on compressed data. We present how the concept can be materialized on Sequitur, a compression algorithm that produces hierarchical grammar-like representations. We discuss the major challenges in applying the idea to various document analytics tasks, and reveal a set of guidelines and also assistant software modules for developers to effectively apply compression-based direct processing . Experiments show that our proposed techniques save 90.8% storage space and 77.5% memory usage, while speeding up data processing significantly, i.e., by 1.6X on sequential systems, and 2.2X on distributed clusters, on average. }, number={11}, journal={PROCEEDINGS OF THE VLDB ENDOWMENT}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Chen, Wenguang}, year={2018}, month={Jul}, pages={1522–1535} } @inproceedings{pittman_guan_shen_lim_patton_2018, title={Exploring Flexible Communications for Streamlining DNN Ensemble Training Pipelines}, ISBN={9781538683842}, url={http://dx.doi.org/10.1109/sc.2018.00067}, DOI={10.1109/sc.2018.00067}, abstractNote={Parallel training of a Deep Neural Network (DNN) ensemble on a cluster of nodes is a common practice to train multiple models in order to construct a model with a higher prediction accuracy, or to quickly tune the parameters of a training model. Existing ensemble training pipelines perform a great deal of redundant operations, resulting in unnecessary CPU usage, or even poor pipeline performance. In order to remove these redundancies, we need pipelines with more communication flexibility than existing DNN frameworks can provide. This project investigates a series of designs to improve pipeline flexibility and adaptivity, while also increasing performance. We implement our designs using Tensorflow with Horovod, and test it using several large DNNs in a large scale GPU cluster, the Titan supercomputer at Oak Ridge National Lab. Our results show that with the new flexible communication schemes, the CPU time spent during training is reduced by 2-11X. Furthermore, our implementation can achieve up to 10X speedups when CPU core limits are imposed. Our best pipeline also reduces the average power draw of the ensemble training process by 5-16% when compared to the baseline.}, booktitle={SC18: International Conference for High Performance Computing, Networking, Storage and Analysis}, publisher={IEEE}, author={Pittman, Randall and Guan, Hui and Shen, Xipeng and Lim, Seung-Hwan and Patton, Robert M.}, year={2018}, month={Nov} } @article{yang_shen_2018, title={FALCON: A Fast Drop-In Replacement of Citation KNN for Multiple Instance Learning}, DOI={10.1145/3269206.3271787}, abstractNote={Citation KNN is an important but compute-intensive algorithm for multiple instance learning (MIL). This paper presents FALCON, a fast replacement of Citation KNN. FALCON accelerates Citation KNN by removing unnecessary distance calculations through two novel optimizations, multi-level triangle inequality-based distance filtering and heap optimization. The careful design allows it to produce the same results as the original Citation KNN does while avoiding 84--99.8% distance calculations. On seven datasets of various sizes and dimensions, FALCON consistently outperforms Citation KNN by one or two orders of magnitude, making it a promising drop-in replacement of Citation KNN for multiple instance learning.}, journal={CIKM'18: PROCEEDINGS OF THE 27TH ACM INTERNATIONAL CONFERENCE ON INFORMATION AND KNOWLEDGE MANAGEMENT}, author={Yang, Shuai and Shen, Xipeng}, year={2018}, pages={67–76} } @article{luo_chen_liu_li_ding_shen_2018, title={Footprint Modeling of Cache Associativity and Granularity}, DOI={10.1145/3240302.3240419}, abstractNote={Two common techniques in efficient caching are associativity and sub-block granularity. This short paper presents a parameterized and composable model for each of the two techniques. It shows how the new models are more general, accurate or efficient than previous modeling solutions in either technique, and how they can be used together to model the cache implemented with both techniques, i.e. sub-block set associative cache.}, journal={PROCEEDINGS OF THE INTERNATIONAL SYMPOSIUM ON MEMORY SYSTEMS (MEMSYS 2018)}, author={Luo, Hao and Chen, Guoyang and Liu, Fangzhou and Li, Pengcheng and Ding, Chen and Shen, Xipeng}, year={2018}, pages={232–242} } @book{cohen_shen_torrellas_tuck_zhou_2018, title={Inter-Disciplinary Research Challenges in Computer Systems for the 2020s}, institution={National Science Foundation}, author={Cohen, A. and Shen, X. and Torrellas, J. and Tuck, James and Zhou, Yuanyuan}, year={2018}, month={Sep} } @article{ning_pittman_shen_2018, title={LCD: A Fast Contrastive Divergence Based Algorithm for Restricted Boltzmann Machine}, volume={108}, ISSN={["1879-2782"]}, DOI={10.1016/j.neunet.2018.08.018}, abstractNote={Restricted Boltzmann Machine (RBM) is the building block of Deep Belief Nets and other deep learning tools. Fast learning and prediction are both essential for practical usage of RBM-based machine learning techniques. This paper proposes Lean Contrastive Divergence (LCD), a modified Contrastive Divergence (CD) algorithm, to accelerate RBM learning and prediction without changing the results. LCD avoids most of the required computations with two optimization techniques. The first is called bounds-based filtering, which, through triangle inequality, replaces expensive calculations of many vector dot products with fast bounds calculations. The second is delta product, which effectively detects and avoids many repeated calculations in the core operation of RBM, Gibbs Sampling. The optimizations are applicable to both the standard contrastive divergence learning algorithm and its variations. In addition, this paper presents how to implement these optimizations effectively on massively parallel processors. Results show that the optimizations can produce several-fold (up to 3X for training and 5.3X for prediction) speedups.}, journal={NEURAL NETWORKS}, author={Ning, Lin and Pittman, Randall and Shen, Xipeng}, year={2018}, month={Dec}, pages={399–410} } @article{yang_shen_2018, title={LEEM: Lean Elastic EM for Gaussian Mixture Model via Bounds-Based Filtering}, ISSN={["1550-4786"]}, DOI={10.1109/ICDM.2018.00083}, abstractNote={Gaussian Mixture Model (GMM) is widely used in characterizing complicated real-world data and has played a crucial role in many pattern recognition problems. GMM is usually trained by Expectation Maximization algorithm (EM) which is computationally intensive. Previous studies have proposed a family of variants of EM. By considering only the data points that are the most important to a model in a GMM when updating that model, they help reduce some GMM training time. They are named Elastic EM in this paper. This work proposes several novel optimizations to further accelerate Elastic EM. These optimizations detect and avoid unnecessary probability calculations through novel bounds-based filtering at E-step as well as a Delta optimization to the M-step. Together, they create Lean Elastic EM (LEEM), which brings multi-fold speedups on six datasets of various sizes and dimensions.}, journal={2018 IEEE INTERNATIONAL CONFERENCE ON DATA MINING (ICDM)}, author={Yang, Shuai and Shen, Xipeng}, year={2018}, pages={677–686} } @article{zhao_zhou_shen_yiu_2018, title={Overhead-Conscious Format Selection for SpMV-Based Applications}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS.2018.00104}, abstractNote={Sparse matrix vector multiplication (SpMV) is an important kernel in many applications and is often the major performance bottleneck. The storage format of sparse matrices critically affects the performance of SpMV. Although there have been previous studies on selecting the appropriate format for a given matrix, they have ignored the influence of runtime prediction overhead and format conversion overhead. For many common uses of SpMV, such overhead is part of the execution times and may outweigh the benefits of new formats. Ignoring them makes the predictions from previous solutions frequently suboptimal and sometimes inferior. On the other hand, the overhead is difficult to consider, as it, along with the benefits of having a new format, varies from matrix to matrix, and from application to application. This work proposes a solution. It first explores the pros and cons of various possible treatments to the overhead in the format selection problem. It then presents an explicit approach which involves several regression models for capturing the influence of the overhead and benefits of format conversions on the overall program performance. It proposes a two-stage lazy-and-light scheme to help control the risks in the format predictions and at the same time maximize the overall format conversion benefits. Experiments show that the technique outperforms previous techniques significantly. It improves the overall performance of applications by 1.14X to 1.43X, significantly larger than the 0.82X to 1.24X upperbound speedups overhead-oblivious methods could give.}, journal={2018 32ND IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Zhao, Yue and Zhou, Weijie and Shen, Xipeng and Yiu, Graham}, year={2018}, pages={950–959} } @article{zhu_wu_shen_shen_shen_wang_2018, title={Resolving the GPU responsiveness dilemma through program transformations}, volume={12}, ISSN={2095-2228 2095-2236}, url={http://dx.doi.org/10.1007/s11704-016-6206-y}, DOI={10.1007/s11704-016-6206-y}, number={3}, journal={Frontiers of Computer Science}, publisher={Springer Science and Business Media LLC}, author={Zhu, Qi and Wu, Bo and Shen, Xipeng and Shen, Kai and Shen, Li and Wang, Zhiying}, year={2018}, month={Feb}, pages={545–559} } @article{shen_2018, title={Rethinking Compilers in the Rise of Machine Learning and AI}, DOI={10.1145/3178372.3183634}, abstractNote={Recent years have witnessed some influential progresses in Machine Learning (ML) and Artificial Intelligence (AI). The progresses may lead to some significant changes to future programming. Many programs, for instance, may be not code written in some specially designed programming languages, but high-level user intentions expressed in natural languages. Deep Learning-based software, despite the difficulties in interpreting their results, may continue its rapid growth in the software market and its influence in people's everyday life. This talk will first examine the implications of these changes to compiler research, and then discuss the potential opportunities that ML and AI could bring to possibly transform the field of compiler research. Specifically, the talk will focus on the possibilities for ML and AI to help reveal the high-level semantics and attributes of software components that traditional compiler technology cannot do, and hence, open important opportunities for high-level large-scoped code reasoning and optimizations---a direction that has some tremendous potential but has been beyond the reach of traditional compiler technology. The talk will discuss how ML and AI may help break the "abstraction wall"---barriers formed by layers of abstractions in modern software---for program analysis and optimizations, and how ML and AI may transform the way in which high-level user intentions get translated into low-level code implementations. The talk will conclude with a list of grand challenges and possible research directions for future compiler constructions.}, journal={CC'18: PROCEEDINGS OF THE 27TH INTERNATIONAL CONFERENCE ON COMPILER CONSTRUCTION}, author={Shen, Xipeng}, year={2018}, pages={1–1} } @article{guan_ding_shen_krim_2018, title={Reuse-Centric K-Means Configuration}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE.2018.00116}, abstractNote={K-means configuration is a time-consuming process due to the iterative nature of k-means. This paper proposes reuse-centric k-means configuration to accelerate k-means configuration. It is based on the observation that the explorations of different configurations share lots of common or similar computations. Effectively reusing the computations from prior trials of different configurations could largely shorten the configuration time. The paper presents a set of novel techniques to materialize the idea, including reuse-based filtering, center reuse, and a two-phase design to capitalize on the reuse opportunities on three levels: validation, k, and feature sets. Experiments show that our approach can accelerate some common configuration tuning methods by 5-9X.}, journal={2018 IEEE 34TH INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE)}, author={Guan, Hui and Ding, Yufei and Shen, Xipeng and Krim, Hamid}, year={2018}, pages={1224–1227} } @article{xu_xu_xue_shen_zheng_huang_yang_2018, title={Taming the "Monster": Overcoming Program Optimization Challenges on SW26010 Through Precise Performance Modeling}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS.2018.00086}, abstractNote={This paper presents an effort for overcoming the complexities of program optimizations on SW26010, the heterogeneous many-core processor that powers Sunway TaihuLight, the world top one supercomputer. The solution centers around a precise, static performance model for modern many-core processor. Through a careful design that leverages the special properties of SW26010 and an effective treatment to massive parallelism, the model achieves a high accuracy, showing less than 5% average errors in estimating program execution performance. The precise performance model opens many opportunities for analyzing and guiding code optimizations. The paper demonstrates the usefulness by revealing a series of insights on the effects of some important code optimizations on SW26010. Moreover, it demonstrates that with such a precise performance model, it is feasible to replace empirical auto-tuning with static auto-tuning for optimizing regular loops on heterogeneous many-core systems. Such a replacement speeds up the tuning process by as much as a factor of 43 while keeping the tuning quality loss below 6%.}, journal={2018 32ND IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Xu, Shizhen and Xu, Yuanchao and Xue, Wei and Shen, Xipeng and Zheng, Fang and Huang, Xiaomeng and Yang, Guangwen}, year={2018}, pages={763–773} } @inproceedings{zhang_zhai_shen_mutlu_chen_2018, title={Zwift}, ISBN={9781450357838}, url={http://dx.doi.org/10.1145/3205289.3205325}, DOI={10.1145/3205289.3205325}, abstractNote={Today's rapidly growing document volumes pose pressing challenges to modern document analytics frameworks, in both space usage and processing time. Recently, a promising method, called text analytics directly on compressed data (TADOC), was proposed for improving both the time and space efficiency of text analytics. The main idea of the technique is to enable direct document analytics on compressed data. This paper focuses on the programming challenges for developing efficient TADOC programs. It presents Zwift, the first programming framework for TADOC, which consists of a Domain Specific Language, a compiler and runtime, and a utility library. Experiments show that Zwift significantly improves programming productivity, while effectively unleashing the power of TADOC, producing code that reduces storage usage by 90.8% and execution time by 41.0% on six text analytics problems.}, booktitle={Proceedings of the 2018 International Conference on Supercomputing - ICS '18}, publisher={ACM Press}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Chen, Wenguang}, year={2018} } @inproceedings{zhao_liao_shen_2017, title={An infrastructure for HPC knowledge sharing and reuse}, volume={52}, DOI={10.1145/3155284.3019023}, abstractNote={This paper presents a prototype infrastructure for addressing the barriers for effective accumulation, sharing, and reuse of the various types of knowledge for high performance parallel computing.}, number={8}, booktitle={ACM SIGPLAN Notices}, author={Zhao, Y. and Liao, C. H. and Shen, Xipeng}, year={2017}, pages={461–462} } @inproceedings{shen_2017, title={Bridging the gap between memory performance and massive parallelism: The critical role of programming systems innovations (keynote)}, volume={52}, DOI={10.1145/3156685.3092569}, abstractNote={This talk examines some trends in the modern developments of memory systems and their relations with the massive parallelism in processors and applications. It then draws on some recent work on GPU to explain the important role of programming systems in bridging the gap; it particularly emphasizes the importance of innovations for enabling better software controllability, more software elasticity, and inter-thread data locality enhancements. The talk further discusses the implications brought to programming systems by the increasingly blurred boundaries among memory, storage, and processing.}, number={9}, booktitle={ACM SIGPLAN Notices}, author={Shen, Xipeng}, year={2017}, pages={1–1} } @article{zhu_wo_shen_shen_wang_2017, title={Co-Run Scheduling with Power Cap on Integrated CPU-GPU Systems}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2017.124}, abstractNote={This paper presents the first systematic study on co-scheduling independent jobs on integrated CPU-GPU systems with power caps considered. It reveals the performance degradations caused by the co-run contentions at the levels of both memory and power. It then examines the problem of using job co-scheduling to alleviate the degradations in this less understood scenario. It offers several algorithms and a lightweight co-run performance and power predictive model for computing the performance bounds of the optimal co-schedules and finding appropriate schedules. Results show that the method can efficiently find co-schedules that significantly improve the system throughput (9-46% on average over the default schedules).}, journal={2017 31ST IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Zhu, Qi and Wo, Bo and Shen, Xipeng and Shen, Li and Wang, Zhiying}, year={2017}, pages={967–977} } @inbook{shen_wu_2017, title={Data placement on GPUs}, ISBN={9780128037386}, url={http://dx.doi.org/10.1016/b978-0-12-803738-6.00005-7}, DOI={10.1016/b978-0-12-803738-6.00005-7}, abstractNote={Modern graphics processing unit (GPU) memory systems consist of a number of components with different properties. How data are placed on the various memory components often causes a significant difference in a program’s performance. This chapter discusses the complexity of GPU memory systems and describes a software framework named PORPLE to show how to automatically resolve the complexity for a given GPU program.}, booktitle={Advances in GPU Research and Practice}, publisher={Elsevier}, author={Shen, X. and Wu, B.}, year={2017}, pages={105–123} } @inproceedings{chen_zhao_shen_zhou_2017, title={EffiSha: A software framework for enabling efficient preemptive scheduling of GPU}, volume={52}, DOI={10.1145/3155284.3018748}, abstractNote={Modern GPUs are broadly adopted in many multitasking environments, including data centers and smartphones. However, the current support for the scheduling of multiple GPU kernels (from different applications) is limited, forming a major barrier for GPU to meet many practical needs. This work for the first time demonstrates that on existing GPUs, efficient preemptive scheduling of GPU kernels is possible even without special hardware support. Specifically, it presents EffiSha, a pure software framework that enables preemptive scheduling of GPU kernels with very low overhead. The enabled preemptive scheduler offers flexible support of kernels of different priorities, and demonstrates significant potential for reducing the average turnaround time and improving the system overall throughput of programs that time share a modern GPU.}, number={8}, booktitle={ACM SIGPLAN Notices}, author={Chen, G. Y. and Zhao, Y. and Shen, Xipeng and Zhou, H. Y.}, year={2017}, pages={3–16} } @inproceedings{chen_zhang_budhiraja_shen_wu_2017, title={Efficient support of position independence on non-volatile memory}, ISBN={9781450349529}, url={http://dx.doi.org/10.1145/3123939.3124543}, DOI={10.1145/3123939.3124543}, abstractNote={This paper explores solutions for enabling efficient supports of position independence of pointer-based data structures on byte-addressable None-Volatile Memory (NVM). When a dynamic data structure (e.g., a linked list) gets loaded from persistent storage into main memory in different executions, the locations of the elements contained in the data structure could differ in the address spaces from one run to another. As a result, some special support must be provided to ensure that the pointers contained in the data structures always point to the correct locations, which is called position independence. This paper shows the insufficiency of traditional methods in supporting position independence on NVM. It proposes a concept called implicit self-contained representations of pointers, and develops two such representations named off-holder and Region ID in Value (RIV) to materialize the concept. Experiments show that the enabled representations provide much more efficient and flexible support of position independence for dynamic data structures, alleviating a major issue for effective data reuses on NVM. CCS CONCEPTS • Hardware → Memory and dense storage; • Computer systems organization → Architectures; • Software and its engineering → Compilers; General programming languages;}, booktitle={Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture - MICRO-50 '17}, publisher={ACM Press}, author={Chen, Guoyang and Zhang, Lei and Budhiraja, Richa and Shen, Xipeng and Wu, Youfeng}, year={2017} } @inproceedings{guan_shen_krim_2017, title={Egeria}, ISBN={9781450351140}, url={http://dx.doi.org/10.1145/3126908.3126961}, DOI={10.1145/3126908.3126961}, abstractNote={Achieving high performance on modern systems is challenging. Even with a detailed profile from a performance tool, writing or refactoring a program to remove its performance issues is still a daunting task for application programmers: it demands lots of program optimization expertise that is often system specific. Vendors often provide some detailed optimization guides to assist programmers in the process. However, these guides are frequently hundreds of pages long, making it difficult for application programmers to master and memorize all the rules and guidelines and properly apply them to a specific problem instance. In this work, we develop a framework named Egeria to alleviate the difficulty. Through Egeria, one can easily construct an advising tool for a certain high performance computing (HPC) domain (e.g., GPU programming) by providing Egeria with a optimization guide or other related documents for the target domain. An advising tool produced by Egeria provides a concise list of essential rules automatically extracted from the documents. At the same time, the advising tool serves as a question-answer agent that can interactively offers suggestions for specific optimization questions. Egeria is made possible through a distinctive multi-layered design that leverages natural language processing techniques and extends them with knowledge of HPC domains and how to extract information relevant to code optimization Experiments on CUDA, OpenCL, and Xeon Phi programming guides demonstrate, both qualitatively and quantitatively, the usefulness of Egeria for HPC. CCS CONCEPTS • General and reference → Performance; • Computing methodologies → Natural language processing;}, booktitle={Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis on - SC '17}, publisher={ACM Press}, author={Guan, Hui and Shen, Xipeng and Krim, Hamid}, year={2017} } @article{ding_shen_2017, title={GLORE: generalized loop redundancy elimination upon LER-notation}, volume={1}, ISSN={2475-1421}, url={http://dx.doi.org/10.1145/3133898}, DOI={10.1145/3133898}, abstractNote={This paper presents GLORE, a novel approach to enabling the detection and removal of large-scoped redundant computations in nested loops. GLORE works on LER-notation, a new representation of computations in both regular and irregular loops. Together with a set of novel algorithms, it makes GLORE able to systematically consider computation reordering at both the expression level and the loop level in a unified manner. GLORE shows an applicability much broader than prior methods have, and frequently lowers the computational complexities of some nested loops that are elusive to prior optimization techniques, producing significantly larger speedups.}, number={OOPSLA}, journal={Proceedings of the ACM on Programming Languages}, publisher={Association for Computing Machinery (ACM)}, author={Ding, Yufei and Shen, Xipeng}, year={2017}, month={Oct}, pages={1–28} } @inproceedings{ding_ning_guan_shen_2017, title={Generalizations of the theory and deployment of triangular inequality for compiler-based strength reduction}, volume={52}, DOI={10.1145/3140587.3062377}, abstractNote={Triangular Inequality (TI) has been used in many manual algorithm designs to achieve good efficiency in solving some distance calculation-based problems. This paper presents our generalization of the idea into a compiler optimization technique, named TI-based strength reduction. The generalization consists of three parts. The first is the establishment of the theoretic foundation of this new optimization via the development of a new form of TI named Angular Triangular Inequality, along with several fundamental theorems. The second is the revealing of the properties of the new forms of TI and the proposal of guided TI adaptation, a systematic method to address the difficulties in effective deployments of TI optimizations. The third is an integration of the new optimization technique in an open-source compiler. Experiments on a set of data mining and machine learning algorithms show that the new technique can speed up the standard implementations by as much as 134X and 46X on average for distance-related problems, outperforming previous TI-based optimizations by 2.35X on average. It also extends the applicability of TI-based optimizations to vector related problems, producing tens of times of speedup.}, number={6}, booktitle={ACM SIGPLAN Notices}, author={Ding, Y. F. and Ning, L. and Guan, H. and Shen, Xipeng}, year={2017}, pages={33–48} } @article{ning_pittman_shen_2017, title={LCD: A Fast Contrastive Divergence Based Algorithm for Restricted Boltzmann Machine}, ISSN={["1550-4786"]}, DOI={10.1109/icdm.2017.131}, abstractNote={Restricted Boltzmann Machine (RBM) is the building block of Deep Belief Nets and other deep learning tools. Fast learning and prediction are both essential for practical usage of RBM-based machine learning techniques. This paper proposes Lean Contrastive Divergence (LCD), a modified Contrastive Divergence (CD) algorithm, to accelerate RBM learning and prediction without changing the results. LCD avoids most of the required computations with two optimization techniques. The first is called bounds-based filtering, which, through triangle inequality, replaces expensive calculations of many vector dot products with fast bounds calculations. The second is delta product, which effectively detects and avoids many repeated calculations in the core operation of RBM, Gibbs Sampling. The optimizations are applicable to both the standard contrastive divergence learning algorithm and its variations. Results show that the optimizations can produce several-fold (up to 3X for training and 5.3X for prediction) speedups.}, journal={2017 17TH IEEE INTERNATIONAL CONFERENCE ON DATA MINING (ICDM)}, author={Ning, Lin and Pittman, Randall and Shen, Xipeng}, year={2017}, pages={1015–1020} } @article{chen_shen_wu_li_2017, title={Optimizing Data Placement on GPU Memory: A Portable Approach}, volume={66}, ISSN={0018-9340}, url={http://dx.doi.org/10.1109/TC.2016.2604372}, DOI={10.1109/tc.2016.2604372}, abstractNote={Modern GPUs feature complex memory system designs. One GPU may contain many types of memory of different properties. The best way to place data in memory is sensitive to many factors (e.g., program inputs, architectures), making portable optimizations of GPU data placement a difficult challenge. PORPLE is a recently proposed method that overcomes the difficulties by enabling online optimizations of data placement through a three-way synergy: a specification language for memory system description, a compiler framework for data access analysis and code staging, and a runtime library for efficiently finding and materializing data placement on the fly. This article provides a comprehensive description of this method, and presents several extensions that significantly improve the scalability of PORPLE, which include a novel algorithm design for efficiently searching for the best data placements, the use of active profiling for reducing the online-profiling overhead, and a systematic examination of a path-based performance model. By automatically tailoring data placements for each execution of a GPU program, the enhanced PORPLE brings significant speedups (1.72X on average) to many GPU kernels across GPU architectures and program inputs.}, number={3}, journal={IEEE Transactions on Computers}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Chen, Guoyang and Shen, Xipeng and Wu, Bo and Li, Dong}, year={2017}, month={Mar}, pages={473–487} } @inbook{wu_shen_2017, title={Software-level task scheduling on GPUs}, ISBN={9780128037386}, url={http://dx.doi.org/10.1016/b978-0-12-803738-6.00004-5}, DOI={10.1016/b978-0-12-803738-6.00004-5}, abstractNote={Task scheduling is critical for exploiting the full power of multicore processors such as graphics processing units (GPUs). Unlike central processing units, GPUs do not provide necessary APIs for the programmers or compilers to use to control scheduling. On the contrary, the hardware scheduler on modern GPUs makes flexible task scheduling difficult. This chapter presents a compiler and runtime framework with the capability to automatically transform and optimize GPU programs to enable controllable task scheduling to the streaming multiprocessors (SMs). At the center of the framework is SM-centric transformation, which addresses the complexities of the hardware scheduler and provides the scheduling capability. The framework presents many opportunities for new optimizations, of which this chapter presents three: parallelism, locality, and processor partitioning. We show in extensive experiments that the optimizations can substantially improve the performance of a set of GPU programs in multiple scenarios.}, booktitle={Advances in GPU Research and Practice}, publisher={Elsevier}, author={Wu, B. and Shen, X.}, year={2017}, pages={83–103} } @article{chen_ding_shen_2017, title={Sweet KNN: An Efficient KNN on GPU through Reconciliation between Redundancy Removal and Regularity}, ISSN={["1084-4627"]}, DOI={10.1109/icde.2017.116}, abstractNote={Finding the k nearest neighbors of a query point or a set of query points (KNN) is a fundamental problem in many application domains. It is expensive to do. Prior efforts in improving its speed have followed two directions with conflicting considerations: One tries to minimize the redundant distance computations but often introduces irregularities into computations, the other tries to exploit the regularity in computations to best exert the power of GPU-like massively parallel processors, which often introduces even extra distance computations. This work gives a detailed study on how to effectively combine the strengths of both approaches. It manages to reconcile the polar opposite effects of the two directions through elastic algorithmic designs, adaptive runtime configurations, and a set of careful implementation-level optimizations. The efforts finally lead to a new KNN on GPU named Sweet KNN, the first high-performance triangular-inequality-based KNN on GPU that manages to reach a sweet point between redundancy minimization and regularity preservation for various datasets. Experiments on a set of datasets show that Sweet KNN outperforms existing GPU implementations on KNN by up to 120X (11X on average).}, journal={2017 IEEE 33RD INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE 2017)}, author={Chen, Guoyang and Ding, Yufei and Shen, Xipeng}, year={2017}, pages={621–632} } @article{zhu_wu_shen_shen_shen_wang_2017, title={Understanding co-run performance on CPU-GPU integrated processors: observations, insights, directions}, volume={11}, ISSN={2095-2228 2095-2236}, url={http://dx.doi.org/10.1007/s11704-016-5468-8}, DOI={10.1007/s11704-016-5468-8}, number={1}, journal={Frontiers of Computer Science}, publisher={Springer Science and Business Media LLC}, author={Zhu, Qi and Wu, Bo and Shen, Xipeng and Shen, Kai and Shen, Li and Wang, Zhiying}, year={2017}, month={Feb}, pages={130–146} } @inproceedings{zheng_oh_zhai_shen_yi_chen_2017, title={Versapipe}, ISBN={9781450349529}, url={http://dx.doi.org/10.1145/3123939.3123978}, DOI={10.1145/3123939.3123978}, abstractNote={Pipeline is an important programming pattern, while GPU, designed mostly for data-level parallel executions, lacks an efficient mechanism to support pipeline programming and executions. This paper provides a systematic examination of various existing pipeline execution models on GPU, and analyzes their strengths and weaknesses. To address their shortcomings, this paper then proposes three new execution models equipped with much improved controllability, including a hybrid model that is capable of getting the strengths of all. These insights ultimately lead to the development of a software programming framework named VersaPipe. With VersaPipe, users only need to write the operations for each pipeline stage. VersaPipe will then automatically assemble the stages into a hybrid execution model and configure it to achieve the best performance. Experiments on a set of pipeline benchmarks and a real-world face detection application show that VersaPipe produces up to $6.90 \times (2.88 \times$ on average) speedups over the original manual implementations. CCS CONCEPTS • General and reference $\rightarrow$ Performance; • Computing methodologies $\rightarrow$ Parallel computing methodologies; • Computer systems organization $\rightarrow$ Heterogeneous (hybrid) systems;}, booktitle={Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture - MICRO-50 '17}, publisher={ACM Press}, author={Zheng, Zhen and Oh, Chanyoung and Zhai, Jidong and Shen, Xipeng and Yi, Youngmin and Chen, Wenguang}, year={2017} } @book{chen_shen_zhou_2016, title={A Software Framework for Efficient Preemptive Scheduling on GPU}, number={TR-2016-1}, institution={North Carolina State University}, author={Chen, Guoyang and Shen, Xipeng and Zhou, Huiyang}, year={2016}, month={Jan} } @inproceedings{chen_shen_2016, title={Coherence-Free Multiview}, ISBN={9781450343619}, url={http://dx.doi.org/10.1145/2925426.2926277}, DOI={10.1145/2925426.2926277}, abstractNote={A Graphic Processing Unit (GPU) system is typically equipped with many types of memory (e.g., global, constant, texture, shared, cache). Data placement determines what data are placed on which type of memory, essential for GPU memory performance. Prior optimizations of data placement always require a single view of a data object on memory, which limits the optimization effectiveness. In this work, we propose coherence-free multiview, an approach that allows multiple views of a single data object to co-exist on GPU memory during a GPU kernel execution. We demonstrate that under certain conditions, the multiple views can remain incoherent while facilitating enhanced data placement. We present a theorem and some compiler support to ensure the soundness of the usage of coherence-free multiview. We further develop reference-discerning data placement, a new way to enhance data placements on GPU. It enables more flexible data placements by using coherence-free multiview to leverage the slack in coherence requirement of some GPU programs. Experiments on three types of GPU systems show that, with less than 200KB space cost, the new data placement technique can provide a 1.6X average (up to 4.27X) speedup.}, booktitle={Proceedings of the 2016 International Conference on Supercomputing - ICS '16}, publisher={ACM Press}, author={Chen, Guoyang and Shen, Xipeng}, year={2016} } @article{zhou_wu_shen_gao_yiu_2016, title={Examining and Reducing the Influence of Sampling Errors on Feedback-Driven Optimizations}, volume={13}, ISSN={["1544-3973"]}, DOI={10.1145/2851502}, abstractNote={Feedback-driven optimization (FDO) is an important component in mainstream compilers. By allowing the compiler to reoptimize the program based on some profiles of the program's dynamic behaviors, it often enhances the quality of the generated code substantially. A barrier for using FDO is that it often requires many training runs to collect enough profiles to amortize the sensitivity of program optimizations to program input changes. Various sampling techniques have been explored to alleviate this time-consuming process. However, the lowered profile accuracy caused by sampling often hurts the benefits of FDO.}, number={1}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Zhou, Mingzhou and Wu, Bo and Shen, Xipeng and Gao, Yaoqing and Yiu, Graham}, year={2016}, month={Apr} } @book{ning_shen_2016, place={Raleigh, NC}, title={LCD: A Fast Contrastive Divergence Based Training Algorithm for Restricted Boltzmann Machine”}, number={TR-2016-3}, institution={North Carolina State University}, author={Ning, Lin and Shen, Xipeng}, year={2016}, month={Apr} } @book{shen_mueller_tuck_2016, title={Languages and Compilers for Parallel Computing}, ISBN={9783319297774 9783319297781}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-29778-1}, DOI={10.1007/978-3-319-29778-1}, abstractNote={This book constitutes the thoroughly refereed post-conference proceedings of the 28th International Workshop on Languages and Compilers for Parallel Computing, LCPC 2015, held in Raleigh, NC, USA, in}, journal={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, year={2016} } @article{chen_zhou_shen_gahm_venkat_booth_marshall_2016, title={OpenCL-based erasure coding on heterogeneous architectures}, volume={7}, DOI={10.1109/asap.2016.7760770}, abstractNote={Erasure coding, Reed-Solomon coding in particular, is a key technique to deal with failures in scale-out storage systems. However, due to the algorithmic complexity, the performance overhead of erasure coding can become a significant bottleneck in storage systems attempting to meet service level agreements (SLAs). Previous work has mainly leveraged SIMD (single-instruction multiple-data) instruction extensions in general purpose processors to improve the processing throughput. In this work, we exploit state-of-art heterogeneous architectures, including GPUs, APUs, and FPGAs, to accelerate erasure coding. We leverage the OpenCL framework for our target heterogeneous architectures and propose code optimizations for each target architecture. Given their different hardware characteristics, we highlight the different optimization strategies for each of the target architectures. Using the throughput metric as the ratio of the input file size over the processing latency, we achieve 2.84 GB/s on a 28-core Xeon CPU, 3.90 GB/s on an NVIDIA K40m GPU, 0.56 GB/s on an AMD Carrizo APU, and 1.19 GB/s (5.35 GB/s if only considering the kernel execution latency) on an Altera Stratix V FPGA, when processing a 836.9MB zipped file with a 30×33 encoding matrix. In comparison, the single-thread code using the Intel's ISA-L library running on the Xeon CPU has the throughput of 0.13 GB/s.}, journal={2016 IEEE 27th International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, publisher={IEEE}, author={Chen, Guoyang and Zhou, Huiyang and Shen, Xipeng and Gahm, Josh and Venkat, Narayan and Booth, Skip and Marshall, John}, year={2016}, month={Jul}, pages={33–40} } @inproceedings{zhao_chen_liao_shen_2016, place={Dagstuhl, Germany}, series={Leibniz International Proceedings in Informatics (LIPIcs)}, title={Towards Ontology-Based Program Analysis}, booktitle={30th European Conference on Object-Oriented Programming (ECOOP 2016)}, publisher={Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik}, author={Zhao, Yue and Chen, Guoyang and Liao, Chunhua and Shen, Xipeng}, editor={Krishnamurthi, Shriram and Lerner, Benjamin S.Editors}, year={2016}, pages={26:1–26:25}, collection={Leibniz International Proceedings in Informatics (LIPIcs)} } @book{zhao_liao_shen_2016, title={Towards Ontology-Based Program Analysis}, number={TR-2016-5}, institution={North Carolina State University}, author={Zhao, Y. and Liao, C. and Shen, X.}, year={2016}, month={Apr} } @article{fu_menzies_shen_2016, title={Tuning for software analytics: Is it really necessary?}, volume={76}, ISSN={0950-5849}, url={http://dx.doi.org/10.1016/j.infsof.2016.04.017}, DOI={10.1016/j.infsof.2016.04.017}, abstractNote={Context: Data miners have been widely used in software engineering to, say, generate defect predictors from static code measures. Such static code defect predictors perform well compared to manual methods, and they are easy to use and useful to use. But one of the “black arts” of data mining is setting the tunings that control the miner. Objective: We seek simple, automatic, and very effective method for finding those tunings. Method: For each experiment with different data sets (from open source JAVA systems), we ran differential evolution as an optimizer to explore the tuning space (as a first step) then tested the tunings using hold-out data. Results: Contrary to our prior expectations, we found these tunings were remarkably simple: it only required tens, not thousands, of attempts to obtain very good results. For example, when learning software defect predictors, this method can quickly find tunings that alter detection precision from 0% to 60%. Conclusion: Since (1) the improvements are so large, and (2) the tuning is so simple, we need to change standard methods in software analytics. At least for defect prediction, it is no longer enough to just run a data miner and present the result without conducting a tuning optimization study. The implication for other kinds of analytics is now an open and pressing issue.}, journal={Information and Software Technology}, publisher={Elsevier BV}, author={Fu, Wei and Menzies, Tim and Shen, Xipeng}, year={2016}, month={Aug}, pages={135–146} } @article{ding_ansel_veeramachaneni_shen_o'reilly_amarasinghe_2015, title={Autotuning Algorithmic Choice for Input Sensitivity}, volume={50}, ISSN={["1558-1160"]}, DOI={10.1145/2813885.2737969}, abstractNote={A daunting challenge faced by program performance autotuning is input sensitivity, where the best autotuned configuration may vary with different input sets. This paper presents a novel two-level input learning algorithm to tackle the challenge for an important class of autotuning problems, algorithmic autotuning. The new approach uses a two-level input clustering method to automatically refine input grouping, feature selection, and classifier construction. Its design solves a series of open issues that are particularly essential to algorithmic autotuning, including the enormous optimization space, complex influence by deep input features, high cost in feature extraction, and variable accuracy of algorithmic choices. Experimental results show that the new solution yields up to a 3x speedup over using a single configuration for all inputs, and a 34x speedup over a traditional one-level method for addressing input sensitivity in program optimizations.}, number={6}, journal={ACM SIGPLAN NOTICES}, author={Ding, Yufei and Ansel, Jason and Veeramachaneni, Kalyan and Shen, Xipeng and O'Reilly, Una-May and Amarasinghe, Saman}, year={2015}, month={Jun}, pages={379–390} } @article{chen_wu_li_shen_2015, title={Enabling Portable Optimizations of Data Placement on GPU}, volume={35}, ISSN={0272-1732 1937-4143}, url={http://dx.doi.org/10.1109/MM.2015.53}, DOI={10.1109/mm.2015.53}, abstractNote={Modern GPU memory systems manifest more varieties, increasing complexities, and rapid changes. Different placements of data on memory systems often cause significant differences in program performance. Most current GPU programming systems rely on programmers to indicate the appropriate placements, but finding the appropriate placements is difficult for programmers in practice owing to the complexity and fast changes of memory systems, as well as the input sensitivity of appropriate data placements--that is, the best placements often differ when a program runs on a different input data set. This article introduces a software framework called Porple. It offers a solution that, for the first time, makes it possible to automatically enhance data placement across a GPU. Through Porple, a GPU program's data gets placed appropriately on memory on the fly, customized to the current input dataset. Moreover, when new memory systems arrive, it can easily adapt the placements accordingly. Experiments on three types of GPU systems show that Porple consistently finds optimal or near-optimal placement, yielding up to 2.93 times (1.75 times average on three generations of GPU) speedups compared to programmers' decisions.}, number={4}, journal={IEEE Micro}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Chen, Guoyang and Wu, Bo and Li, Dong and Shen, Xipeng}, year={2015}, month={Jul}, pages={16–24} } @inproceedings{wu_chen_li_shen_vetter_2015, title={Enabling and Exploiting Flexible Task Assignment on GPU through SM-Centric Program Transformations}, ISBN={9781450335591}, url={http://dx.doi.org/10.1145/2751205.2751213}, DOI={10.1145/2751205.2751213}, abstractNote={A GPU's computing power lies in its abundant memory bandwidth and massive parallelism. However, its hardware thread schedulers, despite being able to quickly distribute computation to processors, often fail to capitalize on program characteristics effectively, achieving only a fraction of the GPU's full potential. Moreover, current GPUs do not allow programmers or compilers to control this thread scheduling, forfeiting important optimization opportunities at the program level. This paper presents a transformation centered on Streaming Multiprocessors (SM); this software approach to circumventing the limitations of the hardware scheduler allows flexible program-level control of scheduling. By permitting precise control of job locality on SMs, the transformation overcomes inherent limitations in prior methods. With this technique, flexible control of GPU scheduling at the program level becomes feasible, which opens up new opportunities for GPU program optimizations. The second part of the paper explores how the new opportunities could be leveraged for GPU performance enhancement, what complexities there are, and how to address them. We show that some simple optimization techniques can enhance co-runs of multiple kernels and improve data locality of irregular applications, producing 20-33% average increase in performance, system throughput, and average turnaround time.}, booktitle={Proceedings of the 29th ACM on International Conference on Supercomputing - ICS '15}, publisher={ACM Press}, author={Wu, Bo and Chen, Guoyang and Li, Dong and Shen, Xipeng and Vetter, Jeffrey}, year={2015} } @inproceedings{chen_shen_2015, title={Free launch}, ISBN={9781450340342}, url={http://dx.doi.org/10.1145/2830772.2830818}, DOI={10.1145/2830772.2830818}, abstractNote={Supporting dynamic parallelism is important for GPU to benefit a broad range of applications. There are currently two fundamental ways for programs to exploit dynamic parallelism on GPU: a software-based approach with software-managed worklists, and a hardware-based approach through dynamic subkernel launches. Neither is satisfactory. The former is complicated to program and is often subject to some load imbalance; the latter suffers large runtime overhead. In this work, we propose free launch, a new software approach to overcoming the shortcomings of both methods. It allows programmers to use subkernel launches to express dynamic parallelism. It employs a novel compiler-based code transformation named subkernel launch removal to replace the subkernel launches with the reuse of parent threads. Coupled with an adaptive task assignment mechanism, the transformation reassigns the tasks in the subkernels to the parent threads with a good load balance. The technique requires no hardware extensions, immediately deployable on existing GPUs. It keeps the programming convenience of the subkernel launch-based approach while avoiding its large runtime overhead. Meanwhile, its superior load balancing makes it outperform manual worklist-based techniques by 3X on average.}, booktitle={Proceedings of the 48th International Symposium on Microarchitecture - MICRO-48}, publisher={ACM Press}, author={Chen, Guoyang and Shen, Xipeng}, year={2015} } @article{zhao_shen_2015, title={On-the-Fly Principled Speculation for FSM Parallelization}, volume={50}, ISSN={["1558-1160"]}, DOI={10.1145/2775054.2694369}, abstractNote={Finite State Machine (FSM) is the backbone of an important class of applications in many domains. Its parallelization has been extremely difficult due to inherent strong dependences in the computation. Recently, principled speculation shows good promise to solve the problem. However, the reliance on offline training makes the approach inconvenient to adopt and hard to apply to many practical FSM applications, which often deal with a large variety of inputs different from training inputs. This work presents an assembly of techniques that completely remove the needs for offline training. The techniques include a set of theoretical results on inherent properties of FSMs, and two newly designed dynamic optimizations for efficient FSM characterization. The new techniques, for the first time, make principle speculation applicable on the fly, and enables swift, automatic configuration of speculative parallelizations to best suit a given FSM and its current input. They eliminate the fundamental barrier for practical adoption of principle speculation for FSM parallelization. Experiments show that the new techniques give significantly higher speedups for some difficult FSM applications in the presence of input changes.}, number={4}, journal={ACM SIGPLAN NOTICES}, author={Zhao, Zhijia and Shen, Xipeng}, year={2015}, month={Apr}, pages={619–630} } @inproceedings{ding_shen_musuvathi_mytkowicz_2015, place={Stanford, CA}, title={TOP: A Framework for Enabling Algorithmic Optimizations for Distance-Related Problems}, booktitle={41st International Conference on Very Large Data Bases (VLDB 2015) : proceedings of the VLDB Endowment, volume 8, number 1-13, Kohala Coast, Hawaii, USA, 31 August-4 September 2015}, publisher={VLDB Endowment}, author={Ding, Yufei and Shen, Xipeng and Musuvathi, Madan and Mytkowicz, Todd}, editor={Li, Chen and Markl, VolkerEditors}, year={2015}, month={Aug} } @book{ding_shen_musuvathi_mytkowicz_2015, title={TOP: A Framework for Enabling Algorithmic Optimizations for Distance-Related Problems”}, number={TR-2015-3}, institution={North Carolina State University}, author={Ding, Yufei and Shen, Xipeng and Musuvathi, Madanlal and Mytkowicz, Todd}, year={2015} } @inbook{zhu_wu_shen_shen_wang_2015, title={Understanding Co-run Degradations on Integrated Heterogeneous Processors}, ISBN={9783319174723 9783319174730}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-17473-0_6}, DOI={10.1007/978-3-319-17473-0_6}, abstractNote={Co-runs of independent applications on systems with heterogeneous processors are common (data centers, mobile devices, etc.). There has been limited understanding on the influence of co-runners on such systems. The previous studys on this topic are on simulators with limited settings. In this work, we conduct a comprehensive investigation of the performance of co-running jobs on integrated heterogeneous processors. The investigation produces a list of interesting and counter-intuitive findings. It reveals some critical design issues in modern operating systems in supporting heterogeneous processors, and suggests some potential solutions at the levels of program transformation and OS design.}, booktitle={Languages and Compilers for Parallel Computing}, publisher={Springer International Publishing}, author={Zhu, Qi and Wu, Bo and Shen, Xipeng and Shen, Li and Wang, Zhiying}, year={2015}, pages={82–97} } @inproceedings{zhu_wu_shen_shen_wang_2015, title={Understanding co-run degradations on integrated heterogeneous processors}, volume={8967}, booktitle={Languages and compilers for parallel computing (lcpc 2014)}, author={Zhu, Q. and Wu, B. and Shen, X. P. and Shen, L. and Wang, Z. Y.}, year={2015}, pages={82–97} } @inproceedings{ding_zhao_shen_musuvathi_mytkowicz_2015, place={Lille, France}, title={Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup}, volume={37}, booktitle={Proceedings of the 32nd International Conference on Machine Learning}, author={Ding, Yufei and Zhao, Yue and Shen, Xipeng and Musuvathi, Madan and Mytkowicz, Todd}, year={2015}, month={Jul}, pages={579–587} } @book{ding_shen_musuvathi_mytkowicz_2015, title={Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup}, number={TR-2015-2}, institution={North Carolina State University}, author={Ding, Yufei and Shen, Xipeng and Musuvathi, Madanlal and Mytkowicz, Todd}, year={2015} } @article{zhao_wu_zhou_ding_sun_shen_wu_2014, title={Call Sequence Prediction through Probabilistic Calling Automata}, volume={49}, ISSN={["1558-1160"]}, DOI={10.1145/2714064.2660221}, abstractNote={Predicting a sequence of upcoming function calls is important for optimizing programs written in modern managed languages (e.g., Java, Javascript, C#.) Existing function call predictions are mainly built on statistical patterns, suitable for predicting a single call but not a sequence of calls. This paper presents a new way to enable call sequence prediction, which exploits program structures through Probabilistic Calling Automata (PCA), a new program representation that captures both the inherent ensuing relations among function calls, and the probabilistic nature of execution paths. It shows that PCA-based prediction outperforms existing predictions, yielding substantial speedup when being applied to guide Just-In-Time compilation. By enabling accurate, efficient call sequence prediction for the first time, PCA-based predictors open up many new opportunities for dynamic program optimizations.}, number={10}, journal={ACM SIGPLAN NOTICES}, author={Zhao, Zhijia and Wu, Bo and Zhou, Mingzhou and Ding, Yufei and Sun, Jianhua and Shen, Xipeng and Wu, Youfeng}, year={2014}, month={Oct}, pages={745–762} } @inproceedings{zhao_wu_shen_2014, title={Challenging the "embarrassingly sequential"}, ISBN={9781450323055}, url={http://dx.doi.org/10.1145/2541940.2541989}, DOI={10.1145/2541940.2541989}, abstractNote={Finite-State Machine (FSM) applications are important for many domains. But FSM computation is inherently sequential, making such applications notoriously difficult to parallelize. Most prior methods address the problem through speculations on simple heuristics, offering limited applicability and inconsistent speedups. This paper provides some principled understanding of FSM parallelization, and offers the first disciplined way to exploit application-specific information to inform speculations for parallelization. Through a series of rigorous analysis, it presents a probabilistic model that captures the relations between speculative executions and the properties of the target FSM and its inputs. With the formulation, it proposes two model-based speculation schemes that automatically customize themselves with the suitable configurations to maximize the parallelization benefits. This rigorous treatment yields near-linear speedup on applications that state-of-the-art techniques can barely accelerate.}, booktitle={Proceedings of the 19th international conference on Architectural support for programming languages and operating systems - ASPLOS '14}, publisher={ACM Press}, author={Zhao, Zhijia and Wu, Bo and Shen, Xipeng}, year={2014} } @inproceedings{ding_zhou_zhao_eisenstat_shen_2014, title={Finding the limit}, ISBN={9781450323055}, url={http://dx.doi.org/10.1145/2541940.2541945}, DOI={10.1145/2541940.2541945}, abstractNote={This work aims to find out the full potential of compilation scheduling for JIT-based runtime systems. Compilation scheduling determines the order in which the compilation units (e.g., functions) in a program are to be compiled or recompiled. It decides when what versions of the units are ready to run, and hence affects performance. But it has been a largely overlooked direction in JIT-related research, with some fundamental questions left open: How significant compilation scheduling is for performance, how good the scheduling schemes employed by existing runtime systems are, and whether a great potential exists for improvement. This study proves the strong NP-completeness of the problem, proposes a heuristic algorithm that yields near optimal schedules, examines the potential of two current scheduling schemes empirically, and explores the relations with JIT designs. It provides the first principled understanding to the complexity and potential of compilation scheduling, shedding some insights for JIT-based runtime system improvement.}, booktitle={Proceedings of the 19th international conference on Architectural support for programming languages and operating systems - ASPLOS '14}, publisher={ACM Press}, author={Ding, Yufei and Zhou, Mingzhou and Zhao, Zhijia and Eisenstat, Sarah and Shen, Xipeng}, year={2014} } @inproceedings{wang_wang_wu_yew_shen_yuan_li_feng_guan_2014, title={Localization of concurrency bugs using shared memory access pairs}, ISBN={9781450330138}, url={http://dx.doi.org/10.1145/2642937.2642972}, DOI={10.1145/2642937.2642972}, abstractNote={We propose an effective approach to automatically localize buggy shared memory accesses that trigger concurrency bugs. Compared to existing approaches, our approach has two advantages. First, as long as enough successful runs of a concurrent program are collected, our approach can localize buggy shared memory accesses even with only one single failed run captured, as opposed to the requirement of capturing multiple failed runs in existing approaches. This is a significant advantage because it is more difficult to capture the elusive failed runs than the successful runs in practice. Second, our approach exhibits more precise bug localization results because it also captures buggy shared memory accesses in those failed runs that terminate prematurely, which are often neglected in existing approaches. Based on this proposed approach, we also implement a prototype, named LOCON. Evaluation results on 16 common concurrency bugs show that all buggy shared memory accesses that trigger these bugs can be precisely localized by LOCON with only one failed run captured.}, booktitle={Proceedings of the 29th ACM/IEEE international conference on Automated software engineering - ASE '14}, publisher={ACM Press}, author={Wang, Wenwen and Wang, Zhenjiang and Wu, Chenggang and Yew, Pen-Chung and Shen, Xipeng and Yuan, Xiang and Li, Jianjun and Feng, Xiaobing and Guan, Yong}, year={2014} } @article{chen_wu_li_shen_2014, title={PORPLE: An Extensible Optimizer for Portable Data Placement on GPU}, ISSN={["1072-4451"]}, DOI={10.1109/micro.2014.20}, abstractNote={GPU is often equipped with complex memory systems, including globalmemory, texture memory, shared memory, constant memory, and variouslevels of cache. Where to place the data is important for theperformance of a GPU program. However, the decision is difficult for aprogrammer to make because of architecture complexity and thesensitivity of suitable data placements to input and architecturechanges.This paper presents PORPLE, a portable data placement engine thatenables a new way to solve the data placement problem. PORPLE consistsof a mini specification language, a source-to-source compiler, and a runtime data placer. The language allows an easy description of amemory system; the compiler transforms a GPU program into a formamenable to runtime profiling and data placement; the placer, based onthe memory description and data access patterns, identifies on the flyappropriate placement schemes for data and places themaccordingly. PORPLE is distinctive in being adaptive to program inputsand architecture changes, being transparent to programmers (in mostcases), and being extensible to new memory architectures. Ourexperiments on three types of GPU systems show that PORPLE is able toconsistently find optimal or near-optimal placement despite the largedifferences among GPU architectures and program inputs, yielding up to2.08X (1.59X on average) speedups on a set of regular and irregularGPU benchmarks.}, journal={2014 47TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO)}, author={Chen, Guoyang and Wu, Bo and Li, Dong and Shen, Xipeng}, year={2014}, pages={88–100} } @inproceedings{zhao_zhou_shen_2014, title={SatScore}, ISBN={9781450329682}, url={http://dx.doi.org/10.1145/2632048.2632080}, DOI={10.1145/2632048.2632080}, abstractNote={Important for user experience on mobile devices, app launch responsiveness has received many recent attentions. This paper reveals a principled pitfall in previous studies. Most of these studies have used average reduction of response delays as the metric for responsiveness. Through a systematic user study and statistical analysis, this paper shows that the metric fails to faithfully reflect user experienced responsiveness. To avoid the pitfall, a straight-forward solution is to employ users' direct feedback as the responsiveness metric, which is unfortunately hard to obtain. This paper presents the promise of solving the dilemma through a SatScore model. It further demonstrates some new opportunities for responsiveness enhancement enabled by the SatScore model.}, booktitle={Proceedings of the 2014 ACM International Joint Conference on Pervasive and Ubiquitous Computing - UbiComp '14 Adjunct}, publisher={ACM Press}, author={Zhao, Zhijia and Zhou, Mingzhou and Shen, Xipeng}, year={2014} } @article{zhou_shen_gao_yiu_2014, title={Space-efficient multi-versioning for input-adaptive feedback-driven program optimizations}, volume={49}, DOI={10.1145/2714064.2660229}, abstractNote={Function versioning is an approach to addressing input-sensitivity of program optimizations. A major side effect of it is notable code size increase, which has been hindering its broad applications to large code bases and space-stringent environments. In this paper, we initiate a systematic exploration into the problem, providing answers to some fundamental questions: Given a space constraint, to which function we should apply versioning? How many versions of a function should we include in the final executable? Is the optimal selection feasible to do in polynomial time? This study proves selecting the best set of versions under a space constraint is NP-complete and proposes a heuristic algorithm named CHoGS which yields near optimal results in quadratic time. We implement the algorithm and conduct experiments through the IBM XL compilers. We observe significant performance enhancement with only slight code size increase; the results from CHoGS show factors of higher space efficiency than those from traditional hotness-based methods.}, number={10}, journal={ACM SIGPLAN Notices}, author={Zhou, M. Z. and Shen, Xipeng and Gao, Y. Q. and Yiu, G.}, year={2014}, pages={763–776} } @article{wu_zhao_zhang_jiang_shen_2013, title={Complexity analysis and algorithm design for reorganizing data to minimize non-coalesced memory accesses on GPU}, volume={48}, ISSN={0362-1340}, url={http://dx.doi.org/10.1145/2517327.2442523}, DOI={10.1145/2517327.2442523}, abstractNote={The performance of Graphic Processing Units (GPU) is sensitive to irregular memory references. Some recent work shows the promise of data reorganization for eliminating non-coalesced memory accesses that are caused by irregular references. However, all previous studies have employed simple, heuristic methods to determine the new data layouts to create. As a result, they either do not provide any performance guarantee or are effective to only some limited scenarios. This paper contributes a fundamental study to the problem. It systematically analyzes the inherent complexity of the problem in various settings, and for the first time, proves that the problem is NP-complete. It then points out the limitations of existing techniques and reveals that in practice, the essence for designing an appropriate data reorganization algorithm can be reduced to a tradeoff among space, time, and complexity. Based on that insight, it develops two new data reorganization algorithms to overcome the limitations of previous methods. Experiments show that an assembly composed of the new algorithms and a previous algorithm can circumvent the inherent complexity in finding optimal data layouts, making it feasible to minimize non-coalesced memory accesses for a variety of irregular applications and settings that are beyond the reach of existing techniques.}, number={8}, journal={ACM SIGPLAN Notices}, publisher={Association for Computing Machinery (ACM)}, author={Wu, Bo and Zhao, Zhijia and Zhang, Eddy Zheng and Jiang, Yunlian and Shen, Xipeng}, year={2013}, month={Aug}, pages={57} } @inproceedings{wang_wu_li_shen_yu_jiao_vetter_2013, place={Piscataway, NJ}, title={Exploring Hybrid Memory for GPU Energy Efficiency through Software-Hardware Co-Design}, DOI={10.1109/pact.2013.6618807}, abstractNote={Hybrid memory designs, such as DRAM plus Phase Change Memory (PCM), have shown some promise for alleviating power and density issues faced by traditional memory systems. But previous studies have concentrated on CPU systems with a modest level of parallelism. This work studies the problem in a massively parallel setting. Specifically, it investigates the special implications to hybrid memory imposed by the massive parallelism in GPU. It empirically shows that, contrary to promising results demonstrated for CPU, previous designs of PCM-based hybrid memory result in significant degradation to the energy efficiency of GPU. It reveals that the fundamental reason comes from a multi-facet mismatch between those designs and the massive parallelism in GPU. It presents a solution that centers around a close cooperation between compiler-directed data placement and hardware-assisted runtime adaptation. The co-design approach helps tap into the full potential of hybrid memory for GPU without requiring dramatic hardware changes over previous designs, yielding 6% and 49% energy saving on average compared to pure DRAM and pure PCM respectively, and keeping performance loss less than 2%.}, booktitle={Proceedings of the 22nd International Conference on Parallel Architectures and Compilation Techniques}, publisher={IEEE}, author={Wang, Bin and Wu, Bo and Li, Dong and Shen, Xipeng and Yu, Weikuan and Jiao, Yizheng and Vetter, Jeffrey}, year={2013} } @inbook{guo_shen_2013, title={Fine-Grained Treatment to Synchronizations in GPU-to-CPU Translation}, ISBN={9783642360350 9783642360367}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-36036-7_12}, DOI={10.1007/978-3-642-36036-7_12}, abstractNote={GPU-to-CPU translation may extend Graphics Processing Units (GPU) programs executions to multi-/many-core CPUs, and hence enable cross-device task migration and promote whole-system synergy. This paper describes some of our findings in treatment to GPU synchronizations during the translation process. We show that careful dependence analysis may allow a fine-grained treatment to synchronizations and reveal redundant computation at the instruction-instance level. Based on thread-level dependence graphs, we present a method to enable such fine-grained treatment automatically. Experiments demonstrate that compared to existing translations, the new approach can yield speedup of a factor of integers.}, booktitle={Languages and Compilers for Parallel Computing}, publisher={Springer Berlin Heidelberg}, author={Guo, Ziyu and Shen, Xipeng}, year={2013}, pages={171–184} } @article{zhao_bebenita_herman_sun_shen_2013, title={HPar}, volume={10}, ISSN={1544-3566}, url={http://dx.doi.org/10.1145/2541228.2555301}, DOI={10.1145/2541228.2555301}, abstractNote={ Parallelizing HTML parsing is challenging due to the complexities of HTML documents and the inherent dependencies in its parsing algorithm. As a result, despite numerous studies in parallel parsing, HTML parsing remains sequential today. It forms one of the final barriers for fully parallelizing browser operations to minimize the browser’s response time—an important variable for user experiences, especially on portable devices. This article provides a comprehensive analysis on the special complexities of parallel HTML parsing and presents a systematic exploration in overcoming those difficulties through specially designed speculative parallelizations. This work develops, to the best of our knowledge, the first pipelining and data-level parallel HTML parsers. The data-level parallel parser, named HPar , achieves up to 2.4× speedup on quadcore devices. This work demonstrates the feasibility of efficient, parallel HTML parsing for the first time and offers a set of novel insights for parallel HTML parsing }, number={4}, journal={ACM Transactions on Architecture and Code Optimization}, publisher={Association for Computing Machinery (ACM)}, author={Zhao, Zhijia and Bebenita, Michael and Herman, Dave and Sun, Jianhua and Shen, Xipeng}, year={2013}, month={Dec}, pages={1–25} } @inbook{tian_jiang_shen_mao_2013, title={Optimal Co-Scheduling to Minimize Makespan on Chip Multiprocessors}, ISBN={9783642358661 9783642358678}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-35867-8_7}, DOI={10.1007/978-3-642-35867-8_7}, abstractNote={On-chip resource sharing among sibling cores causes resource contention on Chip Multiprocessors (CMP), considerably degrading program performance and system fairness. Job co-scheduling attempts to alleviate the problem by assigning jobs to cores intelligently. Despite many heuristics-based empirical explorations, studies on optimal co-scheduling and its inherent complexity start only recently, and all have concentrated on the minimization of total performance degradations. There is another important criterion for scheduling, makespan, which determines the finish time of a job set. Its importance for job co-scheduling on CMP is increasingly recognized, especially with the rise of CMP-based compute cloud, data centers, and server farms. However, optimal co-scheduling for makespan minimization still remains unexplored. This work compares makespan minimization problem with previously studied cost minimization (or degradation minimization) problem, revealing these connections as well as significant differences. It uncovers the computational complexity of the makespan minimization problem, and proposes a series of algorithms to either compute or approximate the optimal schedules. It proves that the problem is NP-complete in a general setting, but for a special case (dual-core without job migrations), the problem is solvable in O(n 2.5·logn) time (n is the number of jobs). In addition, this paper presents a series of algorithms to compute or approximate the optimal schedules in the general setting. Experiments on both real and synthetic problems verify the optimality of the optimal co-scheduling algorithms, and demonstrate the reasonable accuracy and scalability of the approximation algorithms. The findings may advance the current understanding of optimal co-scheduling, facilitate the evaluation of real co-scheduling systems, and provide insights for the development of practical co-scheduling algorithms.}, booktitle={Job Scheduling Strategies for Parallel Processing}, publisher={Springer Berlin Heidelberg}, author={Tian, Kai and Jiang, Yunlian and Shen, Xipeng and Mao, Weizhen}, year={2013}, pages={114–133} } @article{zhou_wu_ding_shen_2013, title={Profmig: A framework for flexible migration of program profiles across software versions}, DOI={10.1109/cgo.2013.6494984}, abstractNote={Offline program profiling is costly, especially when software update is frequent. In this paper, we initiate a systematic exploration in cross-version program profile migration, which tries to effectively reuse the valid part of the behavior profiles of an old version of a software for a new version. We explore the effects imposed on profile reusability by the various factors in program behaviors, profile formats, and impact analysis, and introduce ProfMig, a framework for flexible migrations of various profiles. We demonstrate the effectiveness of the techniques on migrating loop trip-count profiles and dynamic call graphs. The migration saves significant (48-67% on average) profiling time with less than 10% accuracy compromised for most programs.}, journal={Proceedings of the 2013 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)}, publisher={IEEE}, author={Zhou, Mingzhou and Wu, Bo and Ding, Yufei and Shen, Xipeng}, year={2013}, month={Feb} } @inbook{wu_zhou_shen_gao_silvera_yiu_2013, title={Simple Profile Rectifications Go a Long Way}, ISBN={9783642390371 9783642390388}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-39038-8_27}, DOI={10.1007/978-3-642-39038-8_27}, abstractNote={Feedback-driven program optimization (FDO) is common in modern compilers, including Just-In-Time compilers increasingly adopted for object-oriented or scripting languages. This paper describes a systematic study in understanding and alleviating the effects of sampling errors on the usefulness of the obtained profiles for FDO. Taking a statistical approach, it offers a series of counter-intuitive findings, and identifies two kinds of profile errors that affect FDO critically, namely zero-count errors and inconsistency errors. It further proposes statistical profile rectification, a simple approach to correcting profiling errors by leveraging statistical patterns in a profile. Experiments show that the simple approach enhances the effectiveness of sampled profile-based FDO dramatically, increasing the average FDO speedup from 1.16X to 1.3X, around 92% of what full profiles can yield.}, booktitle={ECOOP 2013 – Object-Oriented Programming}, publisher={Springer Berlin Heidelberg}, author={Wu, Bo and Zhou, Mingzhou and Shen, Xipeng and Gao, Yaoqing and Silvera, Raul and Yiu, Graham}, year={2013}, pages={654–678} } @article{shen_liu_zhang_bhamidipati_2012, title={An Infrastructure for Tackling Input-Sensitivity of GPU Program Optimizations}, volume={41}, ISSN={0885-7458 1573-7640}, url={http://dx.doi.org/10.1007/S10766-012-0236-3}, DOI={10.1007/S10766-012-0236-3}, number={6}, journal={International Journal of Parallel Programming}, publisher={Springer Science and Business Media LLC}, author={Shen, Xipeng and Liu, Yixun and Zhang, Eddy Z. and Bhamidipati, Poornima}, year={2012}, month={Dec}, pages={855–869} } @inproceedings{wu_zhao_shen_jiang_gao_silvera_2012, title={Exploiting inter-sequence correlations for program behavior prediction}, ISBN={9781450315616}, url={http://dx.doi.org/10.1145/2384616.2384678}, DOI={10.1145/2384616.2384678}, abstractNote={Prediction of program dynamic behaviors is fundamental to program optimizations, resource management, and architecture reconfigurations. Most existing predictors are based on locality of program behaviors, subject to some inherent limitations. In this paper, we revisit the design philosophy and systematically explore a second source of clues: statistical correlations between the behavior sequences of different program entities. Concentrated on loops, it examines the correlations' existence, strength, and values in enhancing the design of program behavior predictors. It creates the first taxonomy of program behavior sequence patterns. It develops a new form of predictors, named sequence predictors, to effectively translate the correlations into large-scope, proactive predictions of program behavior sequences. It demonstrates the usefulness of the prediction in dynamic version selection and loop importance estimation, showing 19% average speedup on a number of real-world utility applications. By taking scope and timing of behavior prediction as the first-order design objectives, the new approach overcomes limitations of existing program behavior predictors, opening up many new opportunities for runtime optimizations at various layers of computing.}, booktitle={Proceedings of the ACM international conference on Object oriented programming systems languages and applications - OOPSLA '12}, publisher={ACM Press}, author={Wu, Bo and Zhao, Zhijia and Shen, Xipeng and Jiang, Yunlian and Gao, Yaoqing and Silvera, Raul}, year={2012} } @inproceedings{guo_wu_shen_2012, title={One stone two birds}, ISBN={9781450313162}, url={http://dx.doi.org/10.1145/2304576.2304583}, DOI={10.1145/2304576.2304583}, abstractNote={As an approach to promoting whole-system synergy on a heterogeneous computing system, compilation of fine-grained SPMD-threaded code(e.g., GPU CUDA code) for multicore CPU has drawn some recent attentions. This paper concentrates on two important sources of inefficiency that limit existing translators. The first is overly strong synchronizations; the second is thread-level partially redundant computations. In this paper, we point out that both kinds of inefficiency essentially come from a single reason: the non-uniformity among threads. Based on that observation, we present a thread-level dependence analysis, which leads to a code generator with three novel features: an instance-level instruction scheduler for synchronization relaxation, a graph pattern recognition scheme for code shape optimization, and a fine-grained analysis for thread-level partial redundancy removal. Experiments show that the unified solution is effective in resolving both inefficiencies, yielding speedup as much as a factor of 14.}, booktitle={Proceedings of the 26th ACM international conference on Supercomputing - ICS '12}, publisher={ACM Press}, author={Guo, Ziyu and Wu, Bo and Shen, Xipeng}, year={2012} } @article{zhang_jiang_shen_2012, title={The Significance of CMP Cache Sharing on Contemporary Multithreaded Applications}, volume={23}, ISSN={1045-9219}, url={http://dx.doi.org/10.1109/TPDS.2011.130}, DOI={10.1109/TPDS.2011.130}, abstractNote={Cache sharing on modern Chip Multiprocessors (CMPs) reduces communication latency among corunning threads, and also causes interthread cache contention. Most previous studies on the influence of cache sharing have concentrated on the design or management of shared cache. The observed influence is often constrained by the reliance on simulators, the use of out-of-date benchmarks, or the limited coverage of deciding factors. This paper describes a systematic measurement of the influence with most of the potentially important factors covered. The measurement shows some surprising results. Contrary to commonly perceived importance of cache sharing, neither positive nor negative effects from the cache sharing are significant for most of the program executions in the PARSEC benchmark suite, regardless of the types of parallelism, input data sets, architectures, numbers of threads, and assignments of threads to cores. After a detailed analysis, we find that the main reason is the mismatch between the software design (and compilation) of multithreaded applications and CMP architectures. By performing source code transformations on the programs in a cache-sharing-aware manner, we observe up to 53 percent performance increase when the threads are placed on cores appropriately, confirming the software-hardware mismatch as a main reason for the observed insignificance of the influence from cache sharing, and indicating the important role of cache-sharing-aware transformations-a topic only sporadically studied so far-for exerting the power of shared cache.}, number={2}, journal={IEEE Transactions on Parallel and Distributed Systems}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Zhang, Eddy Z. and Jiang, Yunlian and Shen, Xipeng}, year={2012}, month={Feb}, pages={367–374} } @inproceedings{tian_zhang_shen_2011, title={A step towards transparent integration of input-consciousness into dynamic program optimizations}, ISBN={9781450309400}, url={http://dx.doi.org/10.1145/2048066.2048103}, DOI={10.1145/2048066.2048103}, abstractNote={Dynamic program optimizations are critical for the efficiency of applications in managed programming languages and scripting languages. Recent studies have shown that exploitation of program inputs may enhance the effectiveness of dynamic optimizations significantly. However, current solutions for enabling the exploitation require either programmers' annotations or intensive offline profiling, impairing the practical adoption of the techniques. This current work examines the basic feasibility of transparent integration of input-consciousness into dynamic program optimizations, particularly in managed execution environments. It uses transparent learning across production runs as the basic vehicle, and investigates the implications of cross-run learning on each main component of input-conscious dynamic optimizations. It proposes several techniques to address some key challenges for the transparent integration, including randomized inspection-instrumentation for cross-user data collection, a sparsity-tolerant algorithm for input characterization, and selective prediction for efficiency protection. These techniques make it possible to automatically recognize the relations between the inputs to a program and the appropriate ways to optimize it. The whole process happens transparently across production runs; no need for offline profiling or programmer intervention. Experiments on a number of Java programs demonstrate the effectiveness of the techniques in enabling input-consciousness for dynamic optimizations, revealing the feasibility and potential benefits of the new optimization paradigm in some basic settings.}, booktitle={Proceedings of the 2011 ACM international conference on Object oriented programming systems languages and applications - OOPSLA '11}, publisher={ACM Press}, author={Tian, Kai and Zhang, Eddy and Shen, Xipeng}, year={2011} } @inbook{jiang_zhang_shen_gao_archambault_2011, title={Array Regrouping on CMP with Non-uniform Cache Sharing}, ISBN={9783642195945 9783642195952}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-19595-2_7}, DOI={10.1007/978-3-642-19595-2_7}, abstractNote={Array regrouping enhances program spatial locality by interleaving elements of multiple arrays that tend to be accessed closely. Its effectiveness has been systematically studied for sequential programs running on unicore processors, but not for multithreading programs on modern Chip Multiprocessor (CMP) machines. On one hand, the processor-level parallelism on CMP intensifies memory bandwidth pressure, suggesting the potential benefits of array regrouping for CMP computing. On the other hand, CMP architectures exhibit extra complexities—especially the hierarchical, heterogeneous cache sharing among hyperthreads, cores, and processors—that impose new challenges to array regrouping. In this work, we initiate an exploration to the new opportunities and challenges. We propose cache-sharing-aware reference affinity analysis for identifying data affinity in multithreading applications. The analysis consists of affinity-guided thread scheduling and hierarchical reference-vector merging, handles cache sharing among both hyperthreads and cores, and offers hints for array regrouping and the avoidance of false sharing. Preliminary experiments demonstrate the potential of the techniques in improving locality of multithreading applications on CMP with various pitfalls avoided.}, booktitle={Languages and Compilers for Parallel Computing}, publisher={Springer Berlin Heidelberg}, author={Jiang, Yunlian and Zhang, Eddy Z. and Shen, Xipeng and Gao, Yaoqing and Archambault, Roch}, year={2011}, pages={92–105} } @inproceedings{guo_zhang_shen_2011, title={Correctly Treating Synchronizations in Compiling Fine-Grained SPMD-Threaded Programs for CPU}, ISBN={9781457717949 9780769545660}, url={http://dx.doi.org/10.1109/pact.2011.62}, DOI={10.1109/pact.2011.62}, abstractNote={Automatic compilation for multiple types of devices is important, especially given the current trends towards heterogeneous computing. This paper concentrates on some issues in compiling fine-grained SPMD-threaded code (e.g., GPU CUDA code) for multicore CPUs. It points out some correctness pitfalls in existing techniques, particularly in their treatment to implicit synchronizations. It then describes a systematic dependence analysis specially designed for handling implicit synchronizations in SPMD-threaded programs. By unveiling the relations between inter-thread data dependences and correct treatment to synchronizations, it presents a dependence-based solution to the problem. Experiments demonstrate that the proposed techniques can resolve the correctness issues in existing compilation techniques, and help compilers produce correct and efficient translation results.}, booktitle={2011 International Conference on Parallel Architectures and Compilation Techniques}, publisher={IEEE}, author={Guo, Ziyu and Zhang, Eddy Zheng and Shen, Xipeng}, year={2011}, month={Oct} } @inproceedings{wu_zhang_shen_2011, title={Enhancing Data Locality for Dynamic Simulations through Asynchronous Data Transformations and Adaptive Control}, ISBN={9781457717949 9780769545660}, url={http://dx.doi.org/10.1109/pact.2011.56}, DOI={10.1109/pact.2011.56}, abstractNote={Many dynamic simulation programs contain complex, irregular memory reference patterns, and require runtime optimizations to enhance data locality. Current approaches periodically stop the execution of an application to reorder the computation or data based on the current program state to improve the data locality for the next period of execution. In this work, we examine the implications that modern heterogeneous Chip Multiprocessors (CMP) architecture imposes on the optimization paradigm. We develop three techniques to enhance the optimizations. The first is asynchronous data transformation, which moves data reordering off the critical path through dependence circumvention. The second is a novel data transformation algorithm, named TLayout, designed specially to take advantage of modern throughput-oriented processors. Together they provide two complementary ways to attack a benefit-overhead dilemma inherited in traditional techniques. Working with a dynamic adaptation scheme, the techniques produce significant performance improvement for a set of dynamic simulation benchmarks.}, booktitle={2011 International Conference on Parallel Architectures and Compilation Techniques}, publisher={IEEE}, author={Wu, Bo and Zhang, Eddy Z. and Shen, Xipeng}, year={2011}, month={Oct} } @inproceedings{zhang_jiang_guo_tian_shen_2011, title={On-the-fly elimination of dynamic irregularities for GPU computing}, ISBN={9781450302661}, url={http://dx.doi.org/10.1145/1950365.1950408}, DOI={10.1145/1950365.1950408}, abstractNote={The power-efficient massively parallel Graphics Processing Units (GPUs) have become increasingly influential for general-purpose computing over the past few years. However, their efficiency is sensitive to dynamic irregular memory references and control flows in an application. Experiments have shown great performance gains when these irregularities are removed. But it remains an open question how to achieve those gains through software approaches on modern GPUs. This paper presents a systematic exploration to tackle dynamic irregularities in both control flows and memory references. It reveals some properties of dynamic irregularities in both control flows and memory references, their interactions, and their relations with program data and threads. It describes several heuristics-based algorithms and runtime adaptation techniques for effectively removing dynamic irregularities through data reordering and job swapping. It presents a framework, G-Streamline, as a unified software solution to dynamic irregularities in GPU computing. G-Streamline has several distinctive properties. It is a pure software solution and works on the fly, requiring no hardware extensions or offline profiling. It treats both types of irregularities at the same time in a holistic fashion, maximizing the whole-program performance by resolving conflicts among optimizations. Its optimization overhead is largely transparent to GPU kernel executions, jeopardizing no basic efficiency of the GPU application. Finally, it is robust to the presence of various complexities in GPU applications. Experiments show that G-Streamline is effective in reducing dynamic irregularities in GPU computing, producing speedups between 1.07 and 2.5 for a variety of applications.}, booktitle={Proceedings of the sixteenth international conference on Architectural support for programming languages and operating systems - ASPLOS '11}, publisher={ACM Press}, author={Zhang, Eddy Z. and Jiang, Yunlian and Guo, Ziyu and Tian, Kai and Shen, Xipeng}, year={2011} } @article{jiang_tian_shen_zhang_chen_tripathi_2011, title={The Complexity of Optimal Job Co-Scheduling on Chip Multiprocessors and Heuristics-Based Solutions}, volume={22}, ISSN={1045-9219}, url={http://dx.doi.org/10.1109/TPDS.2010.193}, DOI={10.1109/TPDS.2010.193}, abstractNote={In Chip Multiprocessors (CMPs) architecture, it is common that multiple cores share some on-chip cache. The sharing may cause cache thrashing and contention among co-running jobs. Job co-scheduling is an approach to tackling the problem by assigning jobs to cores appropriately so that the contention and consequent performance degradations are minimized. Job co-scheduling includes two tasks: the estimation of co-run performance, and the determination of suitable co-schedules. Most existing studies in job co-scheduling have concentrated on the first task but relies on simple techniques (e.g., trying different schedules) for the second. This paper presents a systematic exploration to the second task. The paper uncovers the computational complexity of the determination of optimal job co-schedules, proving its NP-completeness. It introduces a set of algorithms, based on graph theory and Integer/Linear Programming, for computing optimal co-schedules or their lower bounds in scenarios with or without job migrations. For complex cases, it empirically demonstrates the feasibility for approximating the optimal effectively by proposing several heuristics-based algorithms. These discoveries may facilitate the assessment of job co-schedulers by providing necessary baselines, as well as shed insights to the development of co-scheduling algorithms in practical systems.}, number={7}, journal={IEEE Transactions on Parallel and Distributed Systems}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Jiang, Yunlian and Tian, Kai and Shen, Xipeng and Zhang, Jinghe and Chen, Jie and Tripathi, Rahul}, year={2011}, month={Jul}, pages={1192–1205} } @inproceedings{tian_jiang_zhang_shen_2010, title={An input-centric paradigm for program dynamic optimizations}, ISBN={9781450302036}, url={http://dx.doi.org/10.1145/1869459.1869471}, DOI={10.1145/1869459.1869471}, abstractNote={Accurately predicting program behaviors (e.g., locality, dependency, method calling frequency) is fundamental for program optimizations and runtime adaptations. Despite decades of remarkable progress, prior studies have not systematically exploited program inputs, a deciding factor for program behaviors. Triggered by the strong and predictive correlations between program inputs and behaviors that recent studies have uncovered, this work proposes to include program inputs into the focus of program behavior analysis, cultivating a new paradigm named input-centric program behavior analysis. This new approach consists of three components, forming a three-layer pyramid. At the base is program input characterization, a component for resolving the complexity in program raw inputs and the extraction of important features. In the middle is input-behavior modeling, a component for recognizing and modeling the correlations between characterized input features and program behaviors. These two components constitute input-centric program behavior analysis, which (ideally) is able to predict the large-scope behaviors of a program's execution as soon as the execution starts. The top layer of the pyramid is input-centric adaptation, which capitalizes on the novel opportunities that the first two components create to facilitate proactive adaptation for program optimizations. By centering on program inputs, the new approach resolves a proactivity-adaptivity dilemma inherent in previous techniques. Its benefits are demonstrated through proactive dynamic optimizations and version selection, yielding significant performance improvement on a set of Java and C programs.}, booktitle={Proceedings of the ACM international conference on Object oriented programming systems languages and applications - OOPSLA '10}, publisher={ACM Press}, author={Tian, Kai and Jiang, Yunlian and Zhang, Eddy Z. and Shen, Xipeng}, year={2010} } @inbook{jiang_tian_shen_2010, title={Combining Locality Analysis with Online Proactive Job Co-scheduling in Chip Multiprocessors}, ISBN={9783642115141 9783642115158}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-11515-8_16}, DOI={10.1007/978-3-642-11515-8_16}, abstractNote={The shared-cache contention on Chip Multiprocessors causes performance degradation to applications and hurts system fairness. Many previously proposed solutions schedule programs according to runtime sampled cache performance to reduce cache contention. The strong dependence on runtime sampling inherently limits the scalability and effectiveness of those techniques. This work explores the combination of program locality analysis with job co-scheduling. The rationale is that program locality analysis typically offers a large-scope view of various facets of an application including data access patterns and cache requirement. That knowledge complements the local behaviors sampled by runtime systems. The combination offers the key to overcoming the limitations of prior co-scheduling techniques.Specifically, this work develops a lightweight locality model that enables efficient, proactive prediction of the performance of co-running processes, offering the potential for an integration in online scheduling systems. Compared to existing multicore scheduling systems, the technique reduces performance degradation by 34% (7% performance improvement) and unfairness by 47%. Its proactivity makes it resilient to the scalability issues that constraints the applicability of previous techniques.}, booktitle={High Performance Embedded Architectures and Compilers}, publisher={Springer Berlin Heidelberg}, author={Jiang, Yunlian and Tian, Kai and Shen, Xipeng}, year={2010}, pages={201–215} } @inproceedings{zhang_jiang_shen_2010, title={Does cache sharing on modern CMP matter to the performance of contemporary multithreaded programs?}, ISBN={9781605587080}, url={http://dx.doi.org/10.1145/1693453.1693482}, DOI={10.1145/1693453.1693482}, abstractNote={Most modern Chip Multiprocessors (CMP) feature shared cache on chip. For multithreaded applications, the sharing reduces communication latency among co-running threads, but also results in cache contention. A number of studies have examined the influence of cache sharing on multithreaded applications, but most of them have concentrated on the design or management of shared cache, rather than a systematic measurement of the influence. Consequently, prior measurements have been constrained by the reliance on simulators, the use of out-of-date benchmarks, and the limited coverage of deciding factors. The influence of CMP cache sharing on contemporary multithreaded applications remains preliminarily understood. In this work, we conduct a systematic measurement of the influence on two kinds of commodity CMP machines, using a recently released CMP benchmark suite, PARSEC, with a number of potentially important factors on program, OS, and architecture levels considered. The measurement shows some surprising results. Contrary to commonly perceived importance of cache sharing, neither positive nor negative effects from the cache sharing are significant for most of the program executions, regardless of the types of parallelism, input datasets, architectures, numbers of threads, and assignments of threads to cores. After a detailed analysis, we find that the main reason is the mismatch of current development and compilation of multithreaded applications and CMP architectures. By transforming the programs in a cache-sharing-aware manner, we observe up to 36% performance increase when the threads are placed on cores appropriately.}, booktitle={Proceedings of the 15th ACM SIGPLAN symposium on Principles and practice of parallel programming - PPoPP '10}, publisher={ACM Press}, author={Zhang, Eddy Z. and Jiang, Yunlian and Shen, Xipeng}, year={2010} } @book{albert_paloski_shen_walter_zhang_2010, title={Experiences in Porting the Hubbard Model in Computational Materials Science to GPU}, number={WM-CS-2010-04}, institution={Computer Science Department, The College of William and Mary}, author={Albert, Chase and Paloski, Aaron and Shen, Xipeng and Walter, Eric J. and Zhang, Shiwei}, year={2010} } @inproceedings{jiang_zhang_tian_mao_gethers_shen_gao_2010, title={Exploiting statistical correlations for proactive prediction of program behaviors}, ISBN={9781605586359}, url={http://dx.doi.org/10.1145/1772954.1772989}, DOI={10.1145/1772954.1772989}, abstractNote={This paper presents a finding and a technique on program behavior prediction. The finding is that surprisingly strong statistical correlations exist among the behaviors of different program components (e.g., loops) and among different types of program level behaviors (e.g., loop trip-counts versus data values). Furthermore, the correlations can be beneficially exploited: They help resolve the proactivity-adaptivity dilemma faced by existing program behavior predictions, making it possible to gain the strengths of both approaches--the large scope and earliness of offline-profiling--based predictions, and the cross-input adaptivity of runtime sampling-based predictions. The main technique contributed by this paper centers on a new concept, seminal behaviors. Enlightened by the existence of strong correlations among program behaviors, we propose a regression based framework to automatically identify a small set of behaviors that can lead to accurate prediction of other behaviors in a program. We call these seminal behaviors. By applying statistical learning techniques, the framework constructs predictive models that map from seminal behaviors to other behaviors, enabling proactive and cross-input adaptive prediction of program behaviors. The prediction helps a commercial compiler, the IBM XL C compiler, generate code that runs up to 45% faster (5%-13% on average), demonstrating the large potential of correlation-based techniques for program optimizations.}, booktitle={Proceedings of the 8th annual IEEE/ ACM international symposium on Code generation and optimization - CGO '10}, publisher={ACM Press}, author={Jiang, Yunlian and Zhang, Eddy Z. and Tian, Kai and Mao, Feng and Gethers, Malcom and Shen, Xipeng and Gao, Yaoqing}, year={2010} } @book{kowalski_shen_2010, title={Implementing the Dslash Operator in OpenCL}, number={WM-CS-2010-03}, institution={Computer Science Department, The College of William and Mary}, author={Kowalski, Andy and Shen, Xipeng}, year={2010} } @inbook{jiang_zhang_tian_shen_2010, title={Is Reuse Distance Applicable to Data Locality Analysis on Chip Multiprocessors?}, ISBN={9783642119699 9783642119705}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-11970-5_15}, DOI={10.1007/978-3-642-11970-5_15}, abstractNote={On Chip Multiprocessors (CMP), it is common that multiple cores share certain levels of cache. The sharing increases the contention in cache and memory-to-chip bandwidth, further highlighting the importance of data locality analysis. As a rigorous and hardware-independent locality metric, reuse distance has served for a variety of locality analysis, program transformations, and performance prediction. However, previous studies have concentrated on sequential programs running on unicore processors. On CMP, accesses by different threads (or jobs) interact in the shared cache. How reuse distance applies to the new architecture remains an open question—particularly, how the interactions in shared cache affect the collection and application of reuse distance, and how reuse-distance–based locality analysis should adapt to such architecture changes. This paper presents our explorations towards answering those questions. It first introduces the concept of concurrent reuse distance, a direct extension of the traditional concept of reuse distance with data references by all co-running threads (or jobs) considered. It then discusses the properties of concurrent reuse distance, revealing the special challenges facing the collection and application of concurrent reuse distance on CMP platforms. Finally, it presents the solutions to those challenges for a class of multithreading applications. The solutions center on a probabilistic model that connects concurrent reuse distance with the data locality of each individual thread. Experiments demonstrate the effectiveness of the proposed techniques in facilitating the uses of concurrent reuse distance for CMP computing.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Jiang, Yunlian and Zhang, Eddy Z. and Tian, Kai and Shen, Xipeng}, year={2010}, pages={264–282} } @inbook{mao_shen_2010, title={LU Decomposition on Cell Broadband Engine: An Empirical Study to Exploit Heterogeneous Chip Multiprocessors}, ISBN={9783642156717 9783642156724}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-15672-4_7}, DOI={10.1007/978-3-642-15672-4_7}, abstractNote={To meet the needs of high performance computing, the Cell Broadband Engine owns many features that differ from traditional processors, such as the large number of synergistic processor elements, large register files, the ability to hide main-storage latency with concurrent computation and DMA transfers. The exploitation of those features requires the programmer to carefully tailor programs and simutaneously deal with various performance factors, including locality, load balance, communication overhead, and multi-level parallelism. These factors, unfortunately, are dependent on each other; an optimization that enhances one factor may degrade another. This paper presents our experience on optimizing LU decomposition, one of the commonly used algebra kernels in scientific computing, on Cell Broadband Engine. The optimizations exploit task-level, data-level, and communication-level parallelism. We study the effects of different task distribution strategies, prefetch, and software cache, and explore the tradeoff among different performance factors, stressing the interactions between different optimizations. This work offers some insights in the optimizations on heterogenous multi-core processors, including the selection of programming models, considerations in task distribution, and the holistic perspective required in optimizations.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Mao, Feng and Shen, Xipeng}, year={2010}, pages={61–75} } @inproceedings{zhang_jiang_guo_shen_2010, title={Streamlining GPU applications on the fly}, ISBN={9781450300186}, url={http://dx.doi.org/10.1145/1810085.1810104}, DOI={10.1145/1810085.1810104}, abstractNote={Because of their tremendous computing power and remarkable cost efficiency, GPUs (graphic processing unit) have quickly emerged as a kind of influential platform for high performance computing. However, as GPUs are designed for massive data-parallel computing, their performance is subject to the presence of condition statements in a GPU application. On a conditional branch where threads diverge in which path to take, the threads taking different paths have to run serially. Such divergences often cause serious performance degradations, impairing the adoption of GPU for many applications that contain non-trivial branches or certain types of loops. This paper presents a systematic investigation in the employment of runtime thread-data remapping for solving that problem. It introduces an abstract form of GPU applications, based on which, it describes the use of reference redirection and data layout transformation for remapping data and threads to minimize thread divergences. It discusses the major challenges for practical deployment of the remapping techniques, most notably, the conflict between the large remapping overhead and the need for the remapping to happen on the fly because of the dependence of thread divergences on runtime values. It offers a solution to the challenge by proposing a CPU-GPU pipelining scheme and a label-assign-move (LAM) algorithm to virtually hide all the remapping overhead. At the end, it reports significant performance improvement produced by the remapping for a set of GPU applications, demonstrating the potential of the techniques for streamlining GPU applications on the fly.}, booktitle={Proceedings of the 24th ACM International Conference on Supercomputing - ICS '10}, publisher={ACM Press}, author={Zhang, Eddy Z. and Jiang, Yunlian and Guo, Ziyu and Shen, Xipeng}, year={2010} } @book{zhang_jiang_shen_2009, title={A Systematic Measurement of the Influence of Non-Uniform Cache Sharing on the Performance of Modern Multithreaded Programs}, number={WM-CS-2009-04}, institution={Computer Science Department, The College of William and Mary}, author={Zhang, Eddy Z. and Jiang, Yunlian and Shen, Xipeng}, year={2009} } @inproceedings{liu_zhang_shen_2009, title={A cross-input adaptive framework for GPU program optimizations}, ISBN={9781424437511}, url={http://dx.doi.org/10.1109/ipdps.2009.5160988}, DOI={10.1109/ipdps.2009.5160988}, abstractNote={Recent years have seen a trend in using graphic processing units (GPU) as accelerators for general-purpose computing. The inexpensive, single-chip, massively parallel architecture of GPU has evidentially brought factors of speedup to many numerical applications. However, the development of a high-quality GPU application is challenging, due to the large optimization space and complex unpredictable effects of optimizations on GPU program performance. Recently, several studies have attempted to use empirical search to help the optimization. Although those studies have shown promising results, one important factor—program inputs—in the optimization has remained unexplored. In this work, we initiate the exploration in this new dimension. By conducting a series of measurement, we find that the ability to adapt to program inputs is important for some applications to achieve their best performance on GPU. In light of the findings, we develop an input-adaptive optimization framework, namely G-ADAPT, to address the influence by constructing cross-input predictive models for automatically predicting the (near-)optimal configurations for an arbitrary input to a GPU program. The results demonstrate the promise of the framework in serving as a tool to alleviate the productivity bottleneck in GPU programming.}, booktitle={2009 IEEE International Symposium on Parallel & Distributed Processing}, publisher={IEEE}, author={Liu, Yixun and Zhang, Eddy Z. and Shen, Xipeng}, year={2009}, month={May} } @inproceedings{tian_jiang_shen_2009, title={A study on optimally co-scheduling jobs of different lengths on chip multiprocessors}, ISBN={9781605584133}, url={http://dx.doi.org/10.1145/1531743.1531752}, DOI={10.1145/1531743.1531752}, abstractNote={Cache sharing in Chip Multiprocessors brings cache contention among corunning processes, which often causes considerable degradation of program performance and system fairness. Recent studies have seen the effectiveness of job co-scheduling in alleviating the contention. But finding optimal schedules is challenging. Previous explorations tackle the problem under highly constrained settings. In this work, we show that relaxing those constraints, particularly the assumptions on job lengths and reschedulings, increases the complexity of the problem significantly. Subsequently, we propose a series of algorithms to compute or approximate the optimal schedules in the more general setting. Specifically, we propose an A*-based approach to accelerating the search for optimal schedules by as much as several orders of magnitude. For large problems, we design and evaluate two approximation algorithms, A*-cluster and local-matching algorithms, to effectively approximate the optimal schedules with good accuracy and high scalability. This study contributes better understanding to the optimal co-scheduling problem, facilitates the evaluation of co-scheduling systems, and offers insights and techniques for the development of practical co-scheduling algorithms.}, booktitle={Proceedings of the 6th ACM conference on Computing frontiers - CF '09}, publisher={ACM Press}, author={Tian, Kai and Jiang, Yunlian and Shen, Xipeng}, year={2009} } @book{shen_jiang_2009, title={Co-Run Locality Prediction for Proactive Shared-Cache Management}, number={WM-CS-2009-03}, institution={Computer Science Department, The College of William and Mary}, author={Shen, Xipeng and Jiang, Yunlian}, year={2009} } @inproceedings{mao_shen_2009, title={Cross-Input Learning and Discriminative Prediction in Evolvable Virtual Machines}, ISBN={9780769535760}, url={http://dx.doi.org/10.1109/cgo.2009.10}, DOI={10.1109/cgo.2009.10}, abstractNote={Modern languages like Java and C# rely on dynamic optimizations in virtual machines for better performance. Current dynamic optimizations are reactive. Their performance is constrained by the dependence on runtime sampling and the partial knowledge of the execution. This work tackles the problems by developing a set of techniques that make a virtual machine evolve across production runs. The virtual machine incrementally learns the relation between program inputs and optimization strategies so that it proactively predicts the optimizations suitable for a new run. The prediction is discriminative, guarded by confidence measurement through dynamic self-evaluation. We employ an enriched extensible specification language to resolve the complexities in program inputs. These techniques, implemented in Jikes RVM, produce significant performance improvement on a set of Java applications.}, booktitle={2009 International Symposium on Code Generation and Optimization}, publisher={IEEE}, author={Mao, Feng and Shen, Xipeng}, year={2009}, month={Mar} } @inproceedings{mao_zhang_shen_2009, title={Influence of program inputs on the selection of garbage collectors}, ISBN={9781605583754}, url={http://dx.doi.org/10.1145/1508293.1508307}, DOI={10.1145/1508293.1508307}, abstractNote={Many studies have shown that the best performer among a set of garbage collectors tends to be different for different applications. Researchers have proposed application-specific selection of garbage collectors. In this work, we concentrate on a second dimension of the problem: the influence of program inputs on the selection of garbage collectors. We collect tens to hundreds of inputs for a set of Java benchmarks, and measure their performance on Jikes RVM with different heap sizes and garbage collectors. A rigorous statistical analysis produces four-fold insights. First, inputs influence the relative performance of garbage collectors significantly, causing large variations to the top set of garbage collectors across inputs. Profiling one or few runs is thus inadequate for selecting the garbage collector that works well for most inputs. Second, when the heap size ratio is fixed, one or two types of garbage collectors are enough to stimulate the top performance of the program on all inputs. Third, for some programs, the heap size ratio significantly affects the relative performance of different types of garbage collectors. For the selection of garbage collectors on those programs, it is necessary to have a cross-input predictive model that predicts the minimum possible heap size of the execution on an arbitrary input. Finally, based on regression techniques, we demonstrate the predictability of the minimum possible heap size, indicating the potential feasibility of the input-specific selection of garbage collectors.}, booktitle={Proceedings of the 2009 ACM SIGPLAN/SIGOPS international conference on Virtual execution environments - VEE '09}, publisher={ACM Press}, author={Mao, Feng and Zhang, Eddy Z. and Shen, Xipeng}, year={2009} } @book{shen_jiang_zhang_tan_mao_gethers_2009, title={Program Seminal Behaviors: Automating Input Characterization for Large-Scope Proactive Behavior Prediction}, number={WM-CS-2009-07}, institution={Computer Science Department, The College of William and Mary}, author={Shen, Xipeng and Jiang, Yunlian and Zhang, Eddy Z. and Tan, Kai and Mao, Feng and Gethers, Malcom}, year={2009} } @article{zhong_shen_ding_2009, title={Program locality analysis using reuse distance}, volume={31}, ISSN={0164-0925}, url={http://dx.doi.org/10.1145/1552309.1552310}, DOI={10.1145/1552309.1552310}, abstractNote={ On modern computer systems, the memory performance of an application depends on its locality. For a single execution, locality-correlated measures like average miss rate or working-set size have long been analyzed using reuse distance —the number of distinct locations accessed between consecutive accesses to a given location. This article addresses the analysis problem at the program level, where the size of data and the locality of execution may change significantly depending on the input. }, number={6}, journal={ACM Transactions on Programming Languages and Systems}, publisher={Association for Computing Machinery (ACM)}, author={Zhong, Yutao and Shen, Xipeng and Ding, Chen}, year={2009}, month={Aug}, pages={1–39} } @book{jiang_shen_2009, place={Williamsburg, VA}, title={Speculation with Little Wasting: Saving Cost in Software Speculation Through Transparent Learning}, number={WM-CS-2009-08}, institution={Computer Science Department, The College of William and Mary}, author={Jiang, Yunlian and Shen, Xipeng}, year={2009} } @inproceedings{jiang_mao_shen_2009, title={Speculation with Little Wasting: Saving Cost in Software Speculation through Transparent Learning}, ISBN={9781424457885}, url={http://dx.doi.org/10.1109/ICPADS.2009.130}, DOI={10.1109/ICPADS.2009.130}, abstractNote={Software speculation has shown promise in parallelizing programs with coarse-grained dynamic parallelism. However, most speculation systems use offline profiling for the selection of speculative regions. The mismatch with the input-sensitivity of dynamic parallelism may result in large numbers of speculation failures in many applications. Although with certain protection, the failed speculations may not hurt the basic efficiency of the application, the wasted computing resource (e.g. CPU time and power consumption) may severely degrade system throughput and efficiency. The importance of this issue continuously increases with the advent of multicore and parallelization in portable devices and multiprogramming environments. In this work, we propose the use of transparent statistical learning to make speculation cross-input adaptive. Across production runs of an application, the technique recognizes the patterns of the profitability of the speculative regions in the application and the relation between the profitability and program inputs. On a new run, the profitability of the regions are predicted accordingly and the speculations are switched on and off adaptively. The technique differs from previous techniques in that it requires no explicit training, but is able to adapt to changes in program inputs. It is applicable to both loop-level and function-level parallelism by learning across iterations and executions, permitting arbitrary depth of speculations. Its implementation in a recent software speculation system, namely the Behavior-Oriented Parallelization system, shows substantial reduction of speculation cost with negligible decrease (sometimes, considerable increase) of parallel execution performance.}, booktitle={2009 15th International Conference on Parallel and Distributed Systems}, publisher={IEEE}, author={Jiang, Yunlian and Mao, Feng and Shen, Xipeng}, year={2009} } @book{zhang_jiang_guo_shen_2009, place={Williamsburg, VA}, title={Streamlining GPU Applications On the Fly – Thread Divergence Elimination through Runtime Thread-Data Remapping}, number={WM-CS-2009-08}, institution={Computer Science Department, The College of William and Mary}, author={Zhang, Eddy Z. and Jiang, Yunlian and Guo, Ziyu and Shen, Xipeng}, year={2009} } @article{shen_mao_tian_zhang_2009, title={The study and handling of program inputs in the selection of garbage collectors}, volume={43}, ISSN={0163-5980}, url={http://dx.doi.org/10.1145/1618525.1618531}, DOI={10.1145/1618525.1618531}, abstractNote={Many studies have shown that the best performer among a set of garbage collectors tends to be different for different applications. Researchers have proposed applicationspecific selection of garbage collectors. In this work, we concentrate on a second dimension of the problem: the influence of program inputs on the selection of garbage collectors. We collect tens to hundreds of inputs for a set of Java benchmarks, and measure their performance on Jikes RVM with different heap sizes and garbage collectors. A rigorous statistical analysis produces four-fold insights. First, inputs influence the relative performance of garbage collectors significantly, causing large variations to the top set of garbage collectors across inputs. Profiling one or few runs is thus inadequate for selecting the garbage collector that works well for most inputs. Second, when the heap size ratio is fixed, one or two types of garbage collectors are enough to stimulate the top performance of the program on all inputs. Third, for some programs, the heap size ratio significantly affects the relative performance of different types of garbage collectors. For the selection of garbage collectors on those programs, it is necessary to have a cross-input predictive model that predicts the minimum possible heap size of the execution on an arbitrary input. Finally, by adoptingstatistical learning techniques, we investigate the cross-input predictability of the influence. Experimental results demonstrate that with regression and classification techniques, it is possible to predict the best garbage collector (along with the minimum possible heap size) with reasonable accuracy given an arbitrary input to an application. The exploration opens the opportunities for tailoring the selection of garbage collectors to not only applications but also their inputs.}, number={3}, journal={ACM SIGOPS Operating Systems Review}, publisher={Association for Computing Machinery (ACM)}, author={Shen, Xipeng and Mao, Feng and Tian, Kai and Zhang, Eddy Zheng}, year={2009}, month={Jul}, pages={48} } @book{liu_zhang_shen_2008, place={Williamsburg, VA}, title={A Cross-Input Adaptive Framework for GPU Program Optimization}, number={WM-CS-2008-09}, institution={Computer Science Department, The College of William and Mary}, author={Liu, Yixun and Zhang, Eddy Z. and Shen, Xipeng}, year={2008} } @inproceedings{jiang_shen_2008, title={Adaptive Software Speculation for Enhancing the Cost-Efficiency of Behavior-Oriented Parallelization}, url={http://dx.doi.org/10.1109/icpp.2008.50}, DOI={10.1109/icpp.2008.50}, abstractNote={Recently, software speculation has shown promising results in parallelizing complex sequential programs by exploiting dynamic high-level parallelism. The speculation however is cost-inefficient. Failed speculations may cause unnecessary shared resource contention, power consumption, and interference to co-running applications. In this work, we propose adaptive speculation and design two algorithms to predict the profitability of a speculation and dynamically disable and enable the speculation of a region. Experimental results demonstrate significant improvement of computation efficiency without performance degradation. The adaptive speculation can also enhance the usability of behavior-oriented parallelization by allowing more flexibility in labeling possibly parallel regions.}, booktitle={2008 37th International Conference on Parallel Processing}, publisher={IEEE}, author={Jiang, Yunlian and Shen, Xipeng}, year={2008}, month={Sep} } @article{jiang_shen_2008, title={Adaptive speculation in behavior-oriented parallelization}, DOI={10.1109/ipdps.2008.4536403}, abstractNote={Behavior-oriented parallelization is a technique for parallelizing complex sequential programs that have dynamic parallelism. Although the technique shows promising results, the software speculation mechanism it uses is not cost-efficient. Failed speculations may waste computing resource and severely degrade system efficiency. In this work, we propose adaptive speculation to predict the profitability of a speculation and dynamically enable or disable the speculation of a region. Experimental results demonstrate the effectiveness of the scheme in improving the efficiency of software speculation. In addition, the adaptive speculation can also enhance the usability of behavior-oriented parallelization by allowing users to label potential parallel regions more flexibly.}, journal={2008 IEEE International Symposium on Parallel and Distributed Processing}, publisher={IEEE}, author={Jiang, Yunlian and Shen, Xipeng}, year={2008}, month={Apr} } @inproceedings{jiang_shen_chen_tripathi_2008, title={Analysis and approximation of optimal co-scheduling on chip multiprocessors}, ISBN={9781605582825}, url={http://dx.doi.org/10.1145/1454115.1454146}, DOI={10.1145/1454115.1454146}, abstractNote={Cache sharing among processors is important for Chip Multiprocessors to reduce inter-thread latency, but also brings cache contention, degrading program performance considerably. Recent studies have shown that job co-scheduling can effectively alleviate the contention, but it remains an open question how to efficiently find optimal co-schedules. Solving the question is critical for determining the potential of a co-scheduling system. This paper presents a theoretical analysis of the complexity of co-scheduling, proving its NP-completeness. Furthermore, for a special case when there are two sharers per chip, we propose an algorithm that finds the optimal co-schedules in polynomial time. For more complex cases, we design and evaluate a sequence of approximation algorithms, among which, the hierarchical matching algorithm produces near-optimal schedules and shows good scalability. This study facilitates the evaluation of co-scheduling systems, as well as offers some techniques directly usable in proactive job co-scheduling.}, booktitle={Proceedings of the 17th international conference on Parallel architectures and compilation techniques - PACT '08}, publisher={ACM Press}, author={Jiang, Yunlian and Shen, Xipeng and Chen, Jie and Tripathi, Rahul}, year={2008} } @book{mao_shen_2008, place={Williamsburg, VA}, title={Cross-Input Learning and Discriminative Prediction in Evolvable Virtual Machines}, number={WM-CS-2008-06}, institution={Computer Science Department, The College of William and Mary}, author={Mao, Feng and Shen, Xipeng}, year={2008} } @inbook{jiang_shen_2008, title={Exploration of the Influence of Program Inputs on CMP Co-scheduling}, ISBN={9783540854500 9783540854517}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-540-85451-7_29}, DOI={10.1007/978-3-540-85451-7_29}, abstractNote={Recent studies have showed the effectiveness of job co-scheduling in alleviating shared-cache contention on Chip Multiprocessors. Although program inputs affect cache usage and thus cache contention significantly, their influence on co-scheduling remains unexplored. In this work, we measure that influence and show that the ability to adapt to program inputs is important for a co-scheduler to work effectively on Chip Multiprocessors. We then conduct an exploration in addressing the influence by constructing cross-input predictive models for some memory behaviors that are critical for a recently proposed co-scheduler. The exploration compares the effectiveness of both linear and non-linear regression techniques in the model building. Finally, we conduct a systematic measurement of the sensitivity of co-scheduling on the errors of the predictive behavior models. The results demonstrate the potential of the predictive models in guiding contention-aware co-scheduling.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Jiang, Yunlian and Shen, Xipeng}, year={2008}, month={Aug}, pages={263–273} } @book{mao_shen_2008, title={LU Decomposition on Cell Broadband Engine}, number={WM-CS-2008-08}, institution={Computer Science Department, The College of William and Mary}, author={Mao, Feng and Shen, Xipeng}, year={2008} } @inbook{shen_shaw_2008, title={Scalable Implementation of Efficient Locality Approximation}, ISBN={9783540897392 9783540897408}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-540-89740-8_14}, DOI={10.1007/978-3-540-89740-8_14}, abstractNote={As memory hierarchy becomes deeper and shared by more processors, locality increasingly determines system performance. As a rigorous and precise locality model, reuse distance has been used in program optimizations, performance prediction, memory disambiguation, and locality phase prediction. However, the high cost of measurement has been severely impeding its uses in scenarios requiring high efficiency, such as product compilers, performance debugging, run-time optimizations. We recently discovered the statistical connection between time and reuse distance, which led to an efficient way to approximate reuse distance using time. However, not exposed are some algorithmic and implementation techniques that are vital for the efficiency and scalability of the approximation model. This paper presents these techniques. It describes an algorithm that approximates reuse distance on arbitrary scales; it explains a portable scheme that employs memory controller to accelerate the measure of time distance; it uncovers the algorithm and proof of a trace generator that can facilitate various locality studies.}, booktitle={Languages and Compilers for Parallel Computing}, publisher={Springer Berlin Heidelberg}, author={Shen, Xipeng and Shaw, Jonathan}, year={2008}, pages={202–216} } @book{shen_2007, title={A Hybrid Framework Bridging Locality Analysis and Cache-Aware Scheduling for CMPs}, number={WM-CS-2007-01}, institution={Computer Science Dept., The College of William and Mary}, author={Shen, Xipeng}, year={2007}, month={Jan} } @book{shen_jiang_mao_2007, title={CAPS: Contention-Aware Proactive Scheduling for CMPs}, number={WM-CS-2007-09}, institution={Computer Science Department, The College of William and Mary}, author={Shen, Xipeng and Jiang, Yunlian and Mao, Feng}, year={2007} } @inproceedings{shen_shaw_meeker_ding_2007, title={Locality approximation using time}, ISBN={1595935754}, url={http://dx.doi.org/10.1145/1190216.1190227}, DOI={10.1145/1190216.1190227}, abstractNote={Reuse distance (i.e. LRU stack distance) precisely characterizes program locality and has been a basic tool for memory system research since the 1970s. However, the high cost of measuring has restricted its practical uses in performance debugging, locality analysis and optimizations of long-running applications.In this work, we improve the efficiency by exploring the connection between time and locality. We propose a statistical model that converts cheaply obtained time distance to the more costly reuse distance. Compared to the state-of-the-art technique, this approach reduces measuring time by a factor of 17, and approximates cache line reuses with over 99% accuracy and the cache miss rate with less than 0.4% average error for 12 SPEC 2000 integer and floating-point benchmarks. By exploiting the strong correlations between time and locality, this work makes precise locality as easy to obtain as data access frequency, and opens new opportunities for program optimizations.}, booktitle={Proceedings of the 34th annual ACM SIGPLAN-SIGACT symposium on Principles of programming languages - POPL '07}, publisher={ACM Press}, author={Shen, Xipeng and Shaw, Jonathan and Meeker, Brian and Ding, Chen}, year={2007} } @article{zhong_dropsho_shen_studer_ding_2007, title={Miss Rate Prediction Across Program Inputs and Cache Configurations}, volume={56}, ISSN={0018-9340}, url={http://dx.doi.org/10.1109/tc.2007.50}, DOI={10.1109/tc.2007.50}, abstractNote={Improving cache performance requires understanding cache behavior. However, measuring cache performance for one or two data input sets provides little insight into how cache behavior varies across all data input sets and all cache configurations. This paper uses locality analysis to generate a parameterized model of program cache behavior. Given a cache size and associativity, this model predicts the miss rate for arbitrary data input set sizes. This model also identifies critical data input sizes where cache behavior exhibits marked changes. Experiments show this technique is within 2 percent of the hit rate for set associative caches on a set of floating-point and integer programs using array and pointer-based data structures. Building on the new model, this paper presents an interactive visualization tool that uses a three-dimensional plot to show miss rate changes across program data sizes and cache sizes and its use in evaluating compiler transformations. Other uses of this visualization tool include assisting machine and benchmark-set design. The tool can be accessed on the Web at http://www.cs.rochester.edu/research/locality}, number={3}, journal={IEEE Transactions on Computers}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Zhong, Yutao and Dropsho, Steven G. and Shen, Xipeng and Studer, Ahren and Ding, Chen}, year={2007}, month={Mar}, pages={328–343} } @book{shen_mao_2007, place={Williamsburg, VA}, title={Modeling Relations Between Inputs and Dynamic Behavior for General Programs}, number={WM-CS-2007-07}, institution={Computer Science Department, The College of William and Mary}, author={Shen, Xipeng and Mao, Feng}, year={2007}, month={Jul} } @article{shen_zhong_ding_2007, title={Predicting locality phases for dynamic memory optimization}, volume={67}, ISSN={0743-7315}, url={http://dx.doi.org/10.1016/j.jpdc.2007.01.010}, DOI={10.1016/j.jpdc.2007.01.010}, abstractNote={Dynamic data, cache, and memory adaptation can significantly improve program performance when they are applied on long continuous phases of execution that have dynamic but predictable locality. To support phase-based adaptation, this paper defines the concept of locality phases and describes a four-component analysis technique. Locality-based phase detection uses locality analysis and signal processing techniques to identify phases from the data access trace of a program; frequency-based phase marking inserts code markers that mark phases in all executions of the program; phase hierarchy construction identifies the structure of multiple phases; and phase-sequence prediction predicts the phase sequence from program input parameters. The paper shows the accuracy and the granularity of phase and phase-sequence prediction as well as its uses in dynamic data packing, memory remapping, and cache resizing.}, number={7}, journal={Journal of Parallel and Distributed Computing}, publisher={Elsevier BV}, author={Shen, Xipeng and Zhong, Yutao and Ding, Chen}, year={2007}, month={Jul}, pages={783–796} } @inproceedings{ding_shen_kelsey_tice_huang_zhang_2007, title={Software behavior oriented parallelization}, ISBN={9781595936332}, url={http://dx.doi.org/10.1145/1250734.1250760}, DOI={10.1145/1250734.1250760}, abstractNote={Many sequential applications are difficult to parallelize because of unpredictable control flow, indirect data access, and input-dependent parallelism. These difficulties led us to build a software system for behavior oriented parallelization (BOP), which allows a program to be parallelized based on partial information about program behavior, for example, a user reading just part of the source code, or a profiling tool examining merely one or few executions. The basis of BOP is programmable software speculation, where a user or an analysis tool marks possibly parallel regions in the code, and the run-time system executes these regions speculatively. It is imperative to protect the entire address space during speculation. The main goal of the paper is to demonstrate that the general protection can be made cost effective by three novel techniques: programmable speculation, critical-path minimization, and value-based correctness checking. On a recently acquired multi-core, multi-processor PC, the BOP system reduced the end-to-end execution time by integer factors for a Lisp interpreter, a data compressor, a language parser, and a scientific library, with no change to the underlying hardware or operating system.}, booktitle={Proceedings of the 2007 ACM SIGPLAN conference on Programming language design and implementation - PLDI '07}, publisher={ACM Press}, author={Ding, Chen and Shen, Xipeng and Kelsey, Kirk and Tice, Chris and Huang, Ruke and Zhang, Chengliang}, year={2007} } @book{jiang_shen_2007, title={Study of the Effects of Program Inputs on Co-Scheduling}, number={WM-CS-2007-13}, institution={Computer Science Department, The College of William and Mary}, author={Jiang, Yunlian and Shen, Xipeng}, year={2007}, month={Jul} } @book{bai_shen_zhang_scherer_ding_scott_2006, place={Rochester, NY}, title={A Key-Based Adaptive Transactional Memory Executor}, number={TR909}, institution={Computer Science Dept., University of Rochester}, author={Bai, Tongxin and Shen, Xipeng and Zhang, Chengliang and Scherer, William N. and Ding, Chen and Scott, Michael L.}, year={2006}, month={Dec} } @book{shen_shaw_meeker_2006, title={Accurate Approximation of Locality from Time Distance Histograms}, number={TR902}, institution={Computer Science Dept., University of Rochester}, author={Shen, Xipeng and Shaw, Jonathan and Meeker, Brian}, year={2006}, month={Jul} } @book{parallelization”_ding_shen_kelsey_tice_huang_zhang_2006, title={Behavior-Oriented Parallelization}, number={TR904}, institution={Computer Science Dept., University of Rochester}, author={Parallelization”, Behavior-Oriented and Ding, Chen and Shen, Xipeng and Kelsey, Kirk and Tice, Chris and Huang, Ruke and Zhang, Chengliang}, year={2006}, month={Nov} } @book{shen_shaw_meeker_ding_2006, title={Locality Approximation Using Time}, number={TR901}, institution={Computer Science Dept., University of Rochester}, author={Shen, Xipeng and Shaw, Jonathan and Meeker, Brian and Ding, Chen}, year={2006}, month={Jul} } @inproceedings{zhang_kelsey_shen_ding_hertz_ogihara_2006, title={Program-level adaptive memory management}, ISBN={1595932216}, url={http://dx.doi.org/10.1145/1133956.1133979}, DOI={10.1145/1133956.1133979}, abstractNote={Most application's performance is impacted by the amount of available memory. In a traditional application, which has a fixed working set size, increasing memory has a beneficial effect up until the application's working set is met. In the presence of garbage collection this relationship becomes more complex. While increasing the size of the program's heap reduces the frequency of collections, collecting a heap with memory paged to the backing store is very expensive. We first demonstrate the presence of an optimal heap size for a number of applications running on a machine with a specific configuration. We then introduce a scheme which adaptively finds this good heap size. In this scheme, we track the memory usage and number of page faults at a program's phase boundaries. Using this information, the system selects the soft heap size. By adapting itself dynamically, our scheme is independent of the underlying main memory size, code optimizations, and garbage collection algorithm. We present several experiments on real applications to show the effectiveness of our approach. Our results show that program-level heap control provides up to a factor of 7.8 overall speedup versus using the best possible fixed heap size controlled by the virtual machine on identical garbage collectors.}, booktitle={Proceedings of the 2006 international symposium on Memory management - ISMM '06}, publisher={ACM Press}, author={Zhang, Chengliang and Kelsey, Kirk and Shen, Xipeng and Ding, Chen and Hertz, Matthew and Ogihara, Mitsunori}, year={2006} } @book{zhang_kelsey_shen_ding_hertz_ogihara_2006, title={Waste Not, Want Not: Adaptive Garbage Collection in a Shared Environment}, number={TR908}, institution={Computer Science Dept., University of Rochester}, author={Zhang, Chengliang and Kelsey, Kirk and Shen, Xipeng and Ding, Chen and Hertz, Matthew and Ogihara, Mitsu}, year={2006}, month={Dec} } @inproceedings{ding_zhang_shen_ogihara_2005, title={Gated memory control for memory monitoring, leak detection and garbage collection}, ISBN={1595931473}, url={http://dx.doi.org/10.1145/1111583.1111593}, DOI={10.1145/1111583.1111593}, abstractNote={In the past, program monitoring often operates at the code level, performing checks at function and loop boundaries. Recent research shows that profiling analysis can identify high-level phases in complex binary code. Examples are time steps in scientific simulations and service cycles in utility programs. Because of their larger size and more predictable behavior, program phases make it possible for more accurate and longer term predictions of program behavior, especially its memory usage. This paper describes a new approach that uses phase boundaries as the gates to monitor and control the memory usage. In particular, it presents three techniques: memory usage monitoring, object lifetime classification, and preventive memory management. They use phase-level patterns to predict the trend of the program's memory demand, identify and control memory leaks, improve the efficiency of garbage collection. The potential of the new techniques is demonstrated on two non-trivial applications---a C compiler and a Lisp interpreter.}, booktitle={Proceedings of the 2005 workshop on Memory system performance - MSP '05}, publisher={ACM Press}, author={Ding, Chen and Zhang, Chengliang and Shen, Xipeng and Ogihara, Mitsunori}, year={2005} } @inproceedings{shen_gao_ding_archambault_2005, title={Lightweight reference affinity analysis}, ISBN={1595931678}, url={http://dx.doi.org/10.1145/1088149.1088167}, DOI={10.1145/1088149.1088167}, abstractNote={Previous studies have shown that array regrouping and structure splitting significantly improve data locality. The most effective technique relies on profiling every access to every data element. The high overhead impedes its adoption in a general compiler, In this paper, we show that for array regrouping in scientific programs, the overhead is not needed since the same benefit can be obtained by pure program analysis.We present an interprocedural analysis technique for array regrouping. For each global array, the analysis summarizes the access pattern by access-frequency vectors and then groups arrays with similar vectors. The analysis is context sensitive, so it tracks the exact array access. For each loop or function call, it uses two methods to estimate the frequency of the execution. The first is symbolic analysis in the compiler. The second is lightweight profiling of the code. The same interprocedural analysis is used to cumulate the overall execution frequency by considering the calling context. We implemented a prototype of both the compiler and the profiling analysis in the IBM® compiler, evaluated array regrouping on the entire set of SPEC CPU2000 FORTRAN benchmarks, and compared different analysis methods. The pure compiler-based array regrouping improves the performance for the majority of programs, leaving little room for improvement by code or data profiling.}, booktitle={Proceedings of the 19th annual international conference on Supercomputing - ICS '05}, publisher={ACM Press}, author={Shen, Xipen and Gao, Yaoqing and Ding, Chen and Archambault, Roch}, year={2005} } @book{shen_ding_2005, place={Rochester, NY}, title={Parallelization of Utility Programs Based on Behavior Phase Analysis}, number={TR876}, institution={Computer Science Dept., University of Rochester}, author={Shen, Xipeng and Ding, Chen}, year={2005}, month={Mar} } @inbook{shen_zhong_ding_2005, title={Phase-Based Miss Rate Prediction Across Program Inputs}, ISBN={9783540280095 9783540318132}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/11532378_5}, DOI={10.1007/11532378_5}, abstractNote={Previous work shows the possibility of predicting the cache miss rate (CMR) for all inputs of a program. However, most optimization techniques need to know more than the miss rate of the whole program. Many of them benefit from knowing miss rate of each execution phase of a program for all inputs. In this paper, we describe a method that divides a program into phases that have a regular locality pattern. Using a regression model, it predicts the reuse signature and then the cache miss rate of each phase for all inputs. We compare the prediction with the actual measurement. The average prediction is over 98% accurate for a set of floating-point programs. The predicted CMR-traces matches the simulated ones in spite of dramatic fluctuations of the miss rate over time. This technique can be used for improving dynamic optimization, benchmarking, and compiler design.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Shen, Xipeng and Zhong, Yutao and Ding, Chen}, year={2005}, pages={42–55} } @inbook{zhong_shen_ding_2004, title={A Hierarchical Model of Reference Affinity}, ISBN={9783540211990 9783540246442}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-540-24644-2_4}, DOI={10.1007/978-3-540-24644-2_4}, abstractNote={To improve performance, data reorganization needs locality models to identify groups of data that have reference affinity. Much past work is based on access frequency and does not consider accessing time directly. In this paper, we propose a new model of reference affinity. This model considers the distance between data accesses in addition to the frequency. Affinity groups defined by this model are consistent and have a hierarchical structure. The former property ensures the profitability of data packing, while the latter supports data packing for storage units of different sizes. We then present a statistical clustering method that identifies affinity groups among structure fields and data arrays by analyzing training runs of a program. When used by structure splitting and array regrouping, the new method improves the performance of two test programs by up to 31%. The new data layout is significantly better than that produced by the programmer or by static compiler analysis.}, booktitle={Languages and Compilers for Parallel Computing}, publisher={Springer Berlin Heidelberg}, author={Zhong, Yutao and Shen, Xipeng and Ding, Chen}, year={2004}, pages={48–63} } @inproceedings{shen_ding_2004, title={Adaptive data partition for sorting using probability distribution}, ISBN={0769521975}, url={http://dx.doi.org/10.1109/icpp.2004.1327928}, DOI={10.1109/icpp.2004.1327928}, abstractNote={Many computing problems benefit from dynamic partition of data into smaller chunks with better parallelism and locality. However, it is difficult to partition all types of inputs with the same high efficiency. This paper presents a new partition method in sorting scenario based on probability distribution, an idea first studied by Janus and Lamagna in early 1980's on a mainframe computer. The new technique makes three improvements. The first is a rigorous sampling technique that ensures accurate estimate of the probability distribution. The second is an efficient implementation on modern, cache-based machines. The last is the use of probability distribution in parallel sorting. Experiments show 10-30% improvement in partition balance and 20-70% reduction in partition overhead, compared to two commonly used techniques. The new method reduces the parallel sorting time by 33-50% and outperforms the previous fastest sequential sorting technique by up to 30%.}, booktitle={International Conference on Parallel Processing, 2004. ICPP 2004.}, publisher={IEEE}, author={Shen, X. and Ding, C.}, year={2004} } @inproceedings{zhong_orlovich_shen_ding_2004, place={New York}, title={Array regrouping and structure splitting using whole-program reference affinity}, ISBN={1581138075}, url={http://dx.doi.org/10.1145/996841.996872}, DOI={10.1145/996841.996872}, abstractNote={While the memory of most machines is organized as a hierarchy, program data are laid out in a uniform address space. This paper defines a model of reference affinity, which measures how close a group of data are accessed together in a reference trace. It proves that the model gives a hierarchical partition of program data. At the top is the set of all data with the weakest affinity. At the bottom is each data element with the strongest affinity. Based on the theoretical model, the paper presents k-distance analysis, a practical test for the hierarchical affinity of source-level data. When used for array regrouping and structure splitting, k-distance analysis consistently outperforms data organizations given by the programmer, compiler analysis, frequency profiling, statistical clustering, and all other methods we have tried.}, booktitle={Proceedings of the ACM SIGPLAN 2004 conference on Programming language design and implementation - PLDI '04}, publisher={ACM Press}, author={Zhong, Yutao and Orlovich, Maksim and Shen, Xipeng and Ding, Chen}, year={2004}, pages={255} } @book{shen_ding_dwarkdas_scott_2004, title={Characterizing Phases in Service-Oriented Applications}, number={TR848}, institution={Computer Science Dept., University of Rochester}, author={Shen, Xipeng and Ding, Chen and Dwarkdas, Sandhya and Scott, Michael L.}, year={2004}, month={Nov} } @article{boutell_luo_shen_brown_2004, title={Learning multi-label scene classification}, volume={37}, ISSN={0031-3203}, url={http://dx.doi.org/10.1016/j.patcog.2004.03.009}, DOI={10.1016/j.patcog.2004.03.009}, abstractNote={In classic pattern recognition problems, classes are mutually exclusive by definition. Classification errors occur when the classes overlap in the feature space. We examine a different situation, occurring when the classes are, by definition, not mutually exclusive. Such problems arise in semantic scene and document classification and in medical diagnosis. We present a framework to handle such problems and apply it to the problem of semantic scene classification, where a natural scene may contain multiple objects such that the scene can be described by multiple class labels (e.g., a field scene with a mountain in the background). Such a problem poses challenges to the classic pattern recognition paradigm and demands a different treatment. We discuss approaches for training and testing in this scenario and introduce new metrics for evaluating individual examples, class recall and precision, and overall accuracy. Experiments show that our methods are suitable for scene classification; furthermore, our work appears to generalize to other classification problems of the same nature.}, number={9}, journal={Pattern Recognition}, publisher={Elsevier BV}, author={Boutell, Matthew R. and Luo, Jiebo and Shen, Xipeng and Brown, Christopher M.}, year={2004}, month={Sep}, pages={1757–1771} } @inproceedings{shen_zhong_ding_2004, title={Locality phase prediction}, ISBN={1581138040}, url={http://dx.doi.org/10.1145/1024393.1024414}, DOI={10.1145/1024393.1024414}, abstractNote={As computer memory hierarchy becomes adaptive, its performance increasingly depends on forecasting the dynamic program locality. This paper presents a method that predicts the locality phases of a program by a combination of locality profiling and run-time prediction. By profiling a training input, it identifies locality phases by sifting through all accesses to all data elements using variable-distance sampling, wavelet filtering, and optimal phase partitioning. It then constructs a phase hierarchy through grammar compression. Finally, it inserts phase markers into the program using binary rewriting. When the instrumented program runs, it uses the first few executions of a phase to predict all its later executions.Compared with existing methods based on program code and execution intervals, locality phase prediction is unique because it uses locality profiles, and it marks phase boundaries in program code. The second half of the paper presents a comprehensive evaluation. It measures the accuracy and the coverage of the new technique and compares it with best known run-time methods. It measures its benefit in adaptive cache resizing and memory remapping. Finally, it compares the automatic analysis with manual phase marking. The results show that locality phase prediction is well suited for identifying large, recurring phases in complex programs.}, booktitle={Proceedings of the 11th international conference on Architectural support for programming languages and operating systems - ASPLOS-XI}, publisher={ACM Press}, author={Shen, Xipeng and Zhong, Yutao and Ding, Chen}, year={2004} } @inproceedings{shen_boutell_luo_brown_2004, place={San Jose, California, USA}, title={Multi-label Machine Learning and Its Application to Semantic Scene Classification}, volume={5307}, DOI={10.1117/12.523428}, abstractNote={In classic pattern recognition problems, classes are mutually exclusive by definition. Classification errors occur when the classes overlap in the feature space. We examine a different situation, occurring when the classes are, by definition, not mutually exclusive. Such problems arise in scene and document classification and in medical diagnosis. We present a framework to handle such problems and apply it to the problem of semantic scene classification, where a natural scene may contain multiple objects such that the scene can be described by multiple class labels (e.g., a field scene with a mountain in the background). Such a problem poses challenges to the classic pattern recognition paradigm and demands a different treatment. We discuss approaches for training and testing in this scenario and introduce new metrics for evaluating individual examples, class recall and precision, and overall accuracy. Experiments show that our methods are suitable for scene classification; furthermore, our work appears to generalize to other classification problems of the same nature.}, booktitle={Proceedings of Storage and Retrieval Methods and Applications for Multimedia 2004}, publisher={SPIE}, author={Shen, Xipeng and Boutell, Matthew and Luo, Jiebo and Brown, Christopher}, year={2004}, month={Jan}, pages={188–199} } @book{shen_zhong_ding_2003, title={Adaptive Data Partitioning using Probability Distribution}, number={TR823}, institution={Computer Science Dept., University of Rochester}, author={Shen, Xipeng and Zhong, Yutao and Ding, Chen}, year={2003}, month={Dec} } @book{boutell_shen_luo_brown_2003, title={Multi-label Semantic Scene Classification}, number={TR813}, institution={Dept. of Computer Science, University of Rochester}, author={Boutell, Matthew and Shen, Xipeng and Luo, Jiebo and Brown, Christopher}, year={2003}, month={Sep} } @book{shen_zhong_ding_2003, title={Predicting Hierarchical Phases in Program Data Behavior}, number={TR824}, institution={Computer Science Dept., University of Rochester}, author={Shen, Xipeng and Zhong, Yutao and Ding, Chen}, year={2003}, month={Dec} } @inproceedings{shen_zhong_ding_2003, place={Sante Fe, New Mexico, USA}, title={Regression-Based Multi-Model Prediction of Data Reuse Signature}, booktitle={Proceedings of the Fourth Annual Symposium of the Los Alamos Computer Science Institute}, publisher={Alamos Computer Science Institute}, author={Shen, Xipeng and Zhong, Yutao and Ding, Chen}, year={2003}, month={Oct}, pages={243–251} } @book{ferguson_allen_blaylock_byron_chambers_dzikovska_galescu_shen_swier_swift_2002, title={The Medication Advisor Project: Preliminary Report}, number={776}, institution={Dept. of Computer Science, University of Rochester}, author={Ferguson, George and Allen, James and Blaylock, Nate and Byron, Donna and Chambers, Nate and Dzikovska, Myroslava and Galescu, Lucian and Shen, Xipeng and Swier, Robert and Swift, Mary}, year={2002}, month={May} } @inproceedings{shen_xu_2001, place={Aalborg, Denmark}, title={Study and Auto-Detection of Stress Based on Tonal Pitch Range in Mandarin}, booktitle={Proceedings of Seventh European Conference on Speech Communication and Technology}, author={Shen, Xipeng and Xu, Bo}, year={2001}, month={Sep}, pages={123–126} } @inproceedings{shen_xu_2001, place={Aalborg, Denmark}, title={The Study Of The Effect Of Training Set On Statistical Language Modeling}, booktitle={Proceedings of Seventh European Conference on Speech Communication and Technology}, author={Shen, Xipeng and Xu, Bo}, year={2001}, month={Sep}, pages={721–724} } @inproceedings{shen_xu_2000, place={Beijing, China}, title={A CART-Based Hierarchical Stochastic Model for Prosodic Phrasing in Chinese}, booktitle={Proceedings of International Symposium on Chinese Spoken Language Processing 2000}, author={Shen, Xipeng and Xu, Bo}, year={2000}, month={Oct}, pages={105–108} }