@article{chen_sung_shen_tallent_barker_li_2023, title={Accelerating matrix-centric graph processing on GPUs through bit-level optimizations}, volume={177}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2023.02.013}, abstractNote={Even though it is well known that binary values are common in graph applications (e.g., adjacency matrix), how to leverage the phenomenon for efficiency has not yet been adequately explored. This paper presents a systematic study on how to unlock the potential of the bit-level optimizations of graph computations that involve binary values. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by the intrinsics of modern GPUs. It additionally introduces Deep Reinforcement Learning (DRL) as an efficient way to best configure the bit-level optimizations on the fly. The DQN-based adaptive tile size selector with dedicated model training can reach 68% prediction accuracy. Evaluations on the NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40 and 6555 for essential GraphBLAS kernels SpMV and SpGEMM, respectively, accelerating GraphBLAS-based BFS by up to 433, SSSP, PR, and CC 35, and TC 52.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang}, year={2023}, month={Jul}, pages={53–67} } @article{zhang_mariano_shen_dillig_2023, title={Automated Translation of Functional Big Data similar to eries to SQL}, volume={7}, ISSN={["2475-1421"]}, url={https://doi.org/10.1145/3586047}, DOI={10.1145/3586047}, abstractNote={Big data analytics frameworks like Apache Spark and Flink enable users to implement queries over large, distributed databases using functional APIs. In recent years, these APIs have grown in popularity because their functional interfaces abstract away much of the minutiae of distributed programming required by traditional query languages like SQL. However, the convenience of these APIs comes at a cost because functional queries are often less efficient than their SQL counterparts. Motivated by this observation, we present a new technique for automatically transpiling functional queries to SQL. While our approach is based on the standard paradigm of counterexample-guided inductive synthesis, it uses a novel column-wise decomposition technique to split the synthesis task into smaller subquery synthesis problems. We have implemented this approach as a new tool called RDD2SQL for translating Spark RDD queries to SQL and empirically evaluate the effectiveness of RDD2SQL on a set of real-world RDD queries. Our results show that (1) most RDD queries can be translated to SQL, (2) our tool is very effective at automating this translation, and (3) performing this translation offers significant performance benefits.}, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, author={Zhang, Guoqiang and Mariano, Benjamin and Shen, Xipeng and Dillig, Isil}, year={2023}, month={Apr} } @article{ye_xu_shen_sha_liao_jin_solihin_2023, title={Reconciling Selective Logging and Hardware Persistent Memory Transaction}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA56546.2023.10071088}, abstractNote={Log creation, maintenance, and its persist ordering are known to be performance bottlenecks for durable transactions on persistent memory. Existing hardware persistent memory transactions overlook an important opportunity for improving performance: some persistent data is algorithmically redundant such that it can be recovered from other data, removing the need for logging such data. The paper presents an ISA extension that enables selective logging for hardware persistent memory transactions for the first time. The ISA extension features two novel components: fine-grain logging and lazy persistency. Fine-grain logging allows hardware to log updates on data in the granularity of words without lengthening the critical path of data accesses. Lazy persistency allows updated data to remain in the cache after the transaction commits. Together, the new hardware persistent memory transaction outperforms the state-of-the-art hardware counterpart by 1.8× on average.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Sha, Yan and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2023}, pages={664–676} } @article{chen_niu_ren_wang_shen_2023, title={Survey: Exploiting Data Redundancy for Optimization of Deep Learning}, volume={55}, ISSN={["1557-7341"]}, DOI={10.1145/3564663}, abstractNote={Data redundancy is ubiquitous in the inputs and intermediate results of Deep Neural Networks (DNN) . It offers many significant opportunities for improving DNN performance and efficiency and has been explored in a large body of work. These studies have scattered in many venues across several years. The targets they focus on range from images to videos and texts, and the techniques they use to detect and exploit data redundancy also vary in many aspects. There is not yet a systematic examination and summary of the many efforts, making it difficult for researchers to get a comprehensive view of the prior work, the state of the art, differences and shared principles, and the areas and directions yet to explore. This article tries to fill the void. It surveys hundreds of recent papers on the topic, introduces a novel taxonomy to put the various techniques into a single categorization framework, offers a comprehensive description of the main methods used for exploiting data redundancy in improving multiple kinds of DNNs on data, and points out a set of research opportunities for future exploration.}, number={10}, journal={ACM COMPUTING SURVEYS}, author={Chen, Jou-An and Niu, Wei and Ren, Bin and Wang, Yanzhi and Shen, Xipeng}, year={2023}, month={Oct} } @article{chen_sung_shen_tallent_barker_li_2022, title={Bit-GraphBLAS: Bit-Level Optimizations of Matrix-Centric Graph Processing on GPU}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS53621.2022.00056}, abstractNote={In a general graph data structure like an adjacency matrix, when edges are homogeneous, the connectivity of two nodes can be sufficiently represented using a single bit. This insight has, however, not yet been adequately exploited by the existing matrix-centric graph processing frameworks. This work fills the void by systematically exploring the bit-level representation of graphs and the corresponding optimizations to the graph operations. It proposes a two-level representation named Bit-Block Compressed Sparse Row (B2SR) and presents a series of optimizations to the graph operations on B2SR by leveraging the intrinsics of modern GPUs. Evaluations on NVIDIA Pascal and Volta GPUs show that the optimizations bring up to 40× and 6555× for essential GraphBLAS kernels SpMV and SpGEMM, respectively, making GraphBLAS-based BFS accelerate up to 433×, SSSP, PR, and CC up to 35×, and TC up to 52×.}, journal={2022 IEEE 36TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2022)}, author={Chen, Jou-An and Sung, Hsin-Hsuan and Shen, Xipeng and Tallent, Nathan and Barker, Kevin and Li, Ang}, year={2022}, pages={515–525} } @article{sung_xu_guan_niu_ren_wang_liu_shen_2022, title={Brief Industry Paper: Enabling Level-4 Autonomous Driving on a Single $1k Off-the-Shelf Card}, ISSN={["1545-3421"]}, DOI={10.1109/RTAS54340.2022.00032}, abstractNote={In the past few years we have developed hardware computing systems for commercial autonomous vehicles, but inevitably the high development cost and long turn-around time have been major roadblocks for commercial deployment. Hence we also explored the potential of software optimization. This paper, for the first-time, shows that it is feasible to enable full leve1-4 autonomous driving workloads on a single off-the-shelf card (Jetson AGX Xavier) for less than ${\$}1\mathrm{k}$, an order of magnitude less than the state-of-the-art systems, while meeting all the requirements of latency. The success comes from the resolution of some important issues shared by existing practices through a series of measures and innovations.}, journal={2022 IEEE 28TH REAL-TIME AND EMBEDDED TECHNOLOGY AND APPLICATIONS SYMPOSIUM (RTAS)}, author={Sung, Hsin-Hsuan and Xu, Yuanchao and Guan, Jiexiong and Niu, Wei and Ren, Bin and Wang, Yanzhi and Liu, Shaoshan and Shen, Xipeng}, year={2022}, pages={297–300} } @article{wu_zhang_guan_zheng_du_shen_2022, title={DREW: Efficient Winograd CNN Inference with Deep Reuse}, DOI={10.1145/3485447.3511985}, abstractNote={Deep learning has been used in various domains, including Web services. Convolutional neural networks (CNNs), which are deep learning representatives, are among the most popular neural networks in Web systems. However, CNN employs a high degree of computing. In comparison to the training phase, the inference process is more frequently done on low-power computing equipments. The limited computing resource and high computation pressure limit the effective use of CNN algorithms in industry. Fortunately, a minimal filtering algorithm called Winograd can reduce convolution calculations by minimizing multiplication operations. We find that Winograd convolution can be sped up further by deep reuse technique, which reuses the similar data and computation processes. In this paper, we propose a new inference method, called DREW, which combines deep reuse with Winograd for further accelerating CNNs. DREW handles three difficulties. First, it can detect the similarities from the complex minimal filtering patterns by clustering. Second, it reduces the online clustering cost in a reasonable range. Third, it provides an adjustable method in clustering granularity balancing the performance and accuracy. Experiments show that 1) DREW further accelerates the Winograd convolution by an average of 2.06 × speedup; 2) when DREW is applied to end-to-end Winograd CNN inference, it achieves 1.71 × the average performance speedup with no (<0.4%) accuracy loss; 3) DREW reduces the number of convolution operations to 11% of the original operations on average.}, journal={PROCEEDINGS OF THE ACM WEB CONFERENCE 2022 (WWW'22)}, author={Wu, Ruofan and Zhang, Feng and Guan, Jiawei and Zheng, Zhen and Du, Xiaoyong and Shen, Xipeng}, year={2022}, pages={1807–1816} } @article{cicek_shen_ozturk_2022, title={Energy Efficient Boosting of GEMM Accelerators for DNN via Reuse}, volume={27}, ISSN={["1557-7309"]}, DOI={10.1145/3503469}, abstractNote={Reuse-centric convolutional neural networks (CNN) acceleration speeds up CNN inference by reusing computations for similar neuron vectors in CNN’s input layer or activation maps. This new paradigm of optimizations is, however, largely limited by the overheads in neuron vector similarity detection, an important step in reuse-centric CNN. This article presents an in-depth exploration of architectural support for reuse-centric CNN. It addresses some major limitations of the state-of-the-art design and proposes a novel hardware accelerator that improves neuron vector similarity detection and reduces the energy consumption of reuse-centric CNN inference. The accelerator is implemented to support a wide variety of neural network settings with a banked memory subsystem. Design exploration is performed through RTL simulation and synthesis on an FPGA platform. When integrated into Eyeriss, the accelerator can potentially provide improvements up to 7.75 \( \times \) in performance. Furthermore, it can reduce the energy used for similarity detection up to 95.46%, and it can accelerate the convolutional layer up to 3.63 \( \times \) compared to the software-based implementation running on the CPU.}, number={5}, journal={ACM TRANSACTIONS ON DESIGN AUTOMATION OF ELECTRONIC SYSTEMS}, author={Cicek, Nihat Mert and Shen, Xipeng and Ozturk, Ozcan}, year={2022}, month={Sep} } @article{pan_zhang_zhou_zhai_shen_mutlu_du_2022, title={Exploring Data Analytics Without Decompression on Embedded GPU Systems}, volume={33}, ISSN={["1558-2183"]}, url={https://doi.org/10.1109/TPDS.2021.3119402}, DOI={10.1109/TPDS.2021.3119402}, abstractNote={With the development of computer architecture, even for embedded systems, GPU devices can be integrated, providing outstanding performance and energy efficiency to meet the requirements of different industries, applications, and deployment environments. Data analytics is an important application scenario for embedded systems. Unfortunately, due to the limitation of the capacity of the embedded device, the scale of problems handled by the embedded system is limited. In this paper, we propose a novel data analytics method, called G-TADOC, for efficient text analytics directly on compression on embedded GPU systems. A large amount of data can be compressed and stored in embedded systems, and can be processed directly in the compressed state, which greatly enhances the processing capabilities of the systems. Particularly, G-TADOC has three innovations. First, a novel fine-grained thread-level workload scheduling strategy for GPU threads has been developed, which partitions heavily-dependent loads adaptively in a fine-grained manner. Second, a GPU thread-safe memory pool has been developed to handle inconsistency with low synchronization overheads. Third, a sequence-support strategy is provided to maintain high GPU parallelism while ensuring sequence information for lossless compression. Moreover, G-TADOC involves special optimizations for embedded GPUs, such as utilizing the CPU-GPU shared unified memory. Experiments show that G-TADOC provides 13.2× average speedup compared to the state-of-the-art TADOC. G-TADOC also improves performance-per-cost by 2.6× and energy efficiency by 32.5× over TADOC.}, number={7}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Pan, Zaifeng and Zhang, Feng and Zhou, Yanliang and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2022}, month={Jul}, pages={1553–1568} } @article{xu_ye_solihin_shen_2022, title={FFCCD: Fence-Free Crash-Consistent Concurrent Defragmentation for Persistent Memory}, ISSN={["1063-6897"]}, DOI={10.1145/3470496.3527406}, abstractNote={Persistent Memory (PM) is increasingly supplementing or substituting DRAM as main memory. Prior work have focused on reusability and memory leaks of persistent memory but have not addressed a problem amplified by persistence, persistent memory fragmentation, which refers to the continuous worsening of fragmentation of persistent memory throughout its usage. This paper reveals the challenges and proposes the first systematic crash-consistent solution, Fence-Free Crash-consistent Concurrent Defragmentation (FFCCD). FFCCD resues persistent pointer format, root nodes and typed allocation provided by persistent memory programming model to enable concurrent defragmentation on PM. FFCCD introduces architecture support for concurrent defragmentation that enables a fence-free design and fast read barrier, reducing two major overheads of defragmenting persistent memory. The techniques is effective (28--73% fragmentation reduction) and fast (4.1% execution time overhead).}, journal={PROCEEDINGS OF THE 2022 THE 49TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA '22)}, author={Xu, Yuanchao and Ye, Chencheng and Solihin, Yan and Shen, Xipeng}, year={2022}, pages={274–288} } @article{niu_guan_shen_wang_agrawal_ren_2022, title={GCD(2) : A Globally Optimizing Compiler for Mapping DNNs to Mobile DSPs}, ISSN={["1072-4451"]}, DOI={10.1109/MICRO56248.2022.00044}, abstractNote={More specialized chips are exploiting available high transistor density to expose parallelism at a large scale with more intricate instruction sets. This paper reports on a compilation system GCD 2 , developed to support complex Deep Neural Network (DNN) workloads on mobile DSP chips. We observe several challenges in fully exploiting this architecture, related to SIMD width, more complex SIMD/vector instructions, and VLIW pipeline with the notion of soft dependencies. GCD 2 comprises the following contributions: 1) development of matrix layout formats that support the use of different novel SIMD instructions, 2) formulation and solution of a global optimization problem related to choosing the best instruction (and associated layout) for implementation of each operator in a complete DNN, and 3) SDA, an algorithm for packing instructions with consideration for soft dependencies. These solutions are incorporated in a complete compilation system that is extensively evaluated against other systems using 10 large DNN models. Evaluation results show that GCD 2 outperforms two product-level state-of-the-art end-to-end DNN execution frameworks (TFLite and Qualcomm SNPE) that support mobile DSPs by up to $ 6.0 \times$ speedup, and outperforms three established compilers (Halide, TVM, and RAKE) by up to $4.5 \times, 3.4 \times$ and $4.0 \times$ speedup, respectively. GCD 2 is also unique in supporting, real-time execution of certain DNNs, while its implementation enables two major DNNs to execute on a mobile DSP for the first time.}, journal={2022 55TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO)}, author={Niu, Wei and Guan, Jiexiong and Shen, Xipeng and Wang, Yanzhi and Agrawal, Gagan and Ren, Bin}, year={2022}, pages={512–529} } @article{cicek_ning_ozturk_shen_2022, title={General Reuse-Centric CNN Accelerator}, volume={71}, ISSN={["1557-9956"]}, url={https://doi.org/10.1109/TC.2021.3064608}, DOI={10.1109/TC.2021.3064608}, abstractNote={This article introduces the first general reuse-centric accelerator for CNN inferences. Unlike prior work that exploits similarities only across consecutive video frames, general reuse-centric accelerator is able to discover similarities among arbitrary patches within an image or across independent images, and translate them into computation time and energy savings. Experiments show that the accelerator complements both prior software-based CNN and various CNN hardware accelerators, producing up to 14.96X speedups for similarity discovery, up to 2.70X speedups for overall inference.}, number={4}, journal={IEEE TRANSACTIONS ON COMPUTERS}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Cicek, Nihat Mert and Ning, Lin and Ozturk, Ozcan and Shen, Xipeng}, year={2022}, month={Apr}, pages={880–891} } @article{young_nan_shen_2022, title={IDE Augmented with Human-Learning Inspired Natural Language Programming}, ISSN={["2574-1926"]}, DOI={10.1145/3510454.3516832}, abstractNote={Natural Language (NL) programming, the concept of synthesizing code from natural language inputs, has garnered growing interest among the software community in recent years. Unfortunately, current solutions in the space all suffer from the same problem, they require many labeled training examples due to their data-driven nature. To address this issue, this paper proposes an NLU-driven approach that forgoes the need for large numbers of labeled training examples. Inspired by how humans learn programming, this solution centers around Natural Language Understanding and draws on a novel graph-based mapping algorithm. The resulting NL programming framework, HISyn, uses no training examples, but gives synthesis accuracies comparable to data-driven methods trained on hundreds of samples. HISyn meanwhile demonstrates advantages in terms of interpretability, error diagnosis support, and cross-domain extensibility. To encourage adoption of HISyn among developers, the tool is made available as an extension for the Visual Studio Code IDE, thereby allowing users to easily submit inputs to HISyn and insert the generated code expressions into their active programs. A demo of the HISyn Extension can be found at https://youtu.be/KKOqJS24FNo.}, journal={2022 ACM/IEEE 44TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING: COMPANION PROCEEDINGS (ICSE-COMPANION 2022)}, author={Young, Mitchell and Nan, Zifan and Shen, Xipeng}, year={2022}, pages={110–114} } @article{nan_dave_shen_liao_vanderbruggen_lin_emani_2022, title={Interactive NLU-Powered Ontology-Based Workflow Synthesis for FAIR Support of HPC}, DOI={10.1109/HUST56722.2022.00009}, abstractNote={Workflow synthesis is important for automatically creating the data processing workflow in a FAIR data management system for HPC. Previous methods are table-based, rigid and not scalable. This paper addresses these limitations by developing a new approach to workflow synthesis, interactive NLU-powered ontology-based workflow synthesis (INPOWS). IN-POWS allows the use of Natural Language for queries, maximizes the robustness in handling concepts and language ambiguities through an interactive ontology-based design, and achieves superior extensibility by adopting a synthesis algorithm powered by Natural Language Understanding. In our experiments, INPOWS shows the efficacy in enabling flexible, robust, and extensible workflow synthesis.}, journal={2022 IEEE/ACM INTERNATIONAL WORKSHOP ON HPC USER SUPPORT TOOLS (HUST)}, author={Nan, Zifan and Dave, Mithil and Shen, Xipeng and Liao, Chunhua and Vanderbruggen, Tristan and Lin, Pei-Hung and Emani, Murali}, year={2022}, pages={29–40} } @article{zhang_zhai_shen_mutlu_du_2022, title={POCLib: A High-Performance Framework for Enabling Near Orthogonal Processing on Compression}, volume={33}, ISSN={["1558-2183"]}, url={https://doi.org/10.1109/TPDS.2021.3093234}, DOI={10.1109/TPDS.2021.3093234}, abstractNote={Parallel technology boosts data processing in recent years, and parallel direct data processing on hierarchically compressed documents exhibits great promise. The high-performance direct data processing technique brings large savings in both time and space by removing the need for decompressing data. However, its benefits have been limited to data traversal operations; for random accesses, direct data processing is several times slower than the state-of-the-art baselines. This article proposes a novel concept, orthogonal processing on compression (orthogonal POC), which means that text analytics can be efficiently supported directly on compressed data, regardless of the type of the data processing – that is, the type of data processing is orthogonal to its capability of conducting POC. Previous proposals, such as TADOC, are not orthogonal POC. This article presents a set of techniques that successfully eliminate the limitation, and for the first time, establishes the near orthogonal POC feasibility of effectively handling both data traversal operations and random data accesses on hierarchically-compressed data. The work focuses on text data and yields a unified high-performance library, called POCLib. In a ten-node distributed Spark cluster on Amazon EC2, POCLib achieves 3.1× speedup over the state-of-the-art on random data accesses to compressed data, while preserving the capability of supporting traversal operations efficiently and providing large (3.9×) space savings.}, number={2}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2022}, month={Feb}, pages={459–475} } @article{ye_xu_shen_jin_liao_solihin_2022, title={Preserving Addressability Upon GC-Triggered Data Movements on Non-Volatile Memory}, volume={19}, ISSN={["1544-3973"]}, DOI={10.1145/3511706}, abstractNote={This article points out an important threat that application-level Garbage Collection (GC) creates to the use of non-volatile memory (NVM). Data movements incurred by GC may invalidate the pointers to objects on NVM and, hence, harm the reusability of persistent data across executions. The article proposes the concept of movement-oblivious addressing (MOA), and develops and compares three novel solutions to materialize the concept for solving the addressability problem. It evaluates the designs on five benchmarks and a real-world application. The results demonstrate the promise of the proposed solutions, especially hardware-supported Multi-Level GPointer, in addressing the problem in a space- and time-efficient manner.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Jin, Hai and Liao, Xiaofei and Solihin, Yan}, year={2022}, month={Jun} } @article{xia_shu_shen_menzies_2022, title={Sequential Model Optimization for Software Effort Estimation}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.3047072}, DOI={10.1109/TSE.2020.3047072}, abstractNote={Many methods have been proposed to estimate how much effort is required to build and maintain software. Much of that research tries to recommend a single method – an approach that makes the dubious assumption that one method can handle the diversity of software project data. To address this drawback, we apply a configuration technique called “ROME” (Rapid Optimizing Methods for Estimation), which uses sequential model-based optimization (SMO) to find what configuration settings of effort estimation techniques work best for a particular data set. We test this method using data from 1161 traditional waterfall projects and 120 contemporary projects (from GitHub). In terms of magnitude of relative error and standardized accuracy, we find that ROME achieves better performance than the state-of-the-art methods for both traditional waterfall and contemporary projects. In addition, we conclude that we should not recommend one method for estimation. Rather, it is better to search through a wide range of different methods to find what works best for the local data. To the best of our knowledge, this is the largest effort estimation experiment yet attempted and the only one to test its methods on traditional waterfall and contemporary projects.}, number={6}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Xia, Tianpei and Shu, Rui and Shen, Xipeng and Menzies, Tim}, year={2022}, month={Jun}, pages={1994–2009} } @article{xu_ye_shen_solihin_2022, title={Temporal Exposure Reduction Protection for Persistent Memory}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA53966.2022.00071}, abstractNote={The long-living nature and byte-addressability of persistent memory (PM) amplifies the importance of strong memory protections. This paper develops temporal exposure reduction protection (TERP) as a framework for enforcing memory safety. Aiming to minimize the time when a PM region is accessible, TERP offers a complementary dimension of memory protection. The paper gives a formal definition of TERP, explores the semantics space of TERP constructs, and the relations with security and composability in both sequential and parallel executions. It proposes programming system and architecture solutions for the key challenges for the adoption of TERP, which draws on novel supports in both compilers and hardware to efficiently meet the exposure time target. Experiments validate the efficacy of the proposed support of TERP, in both efficiency and exposure time minimization.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Xu, Yuanchao and Ye, Chencheng and Shen, Xipeng and Solihin, Yan}, year={2022}, pages={908–924} } @article{sun_xie_shah_shen_2021, title={A Machine Learning Based Ensemble Forecasting Optimization Algorithm for Preseason Prediction of Atlantic Hurricane Activity}, volume={12}, ISSN={["2073-4433"]}, url={https://doi.org/10.3390/atmos12040522}, DOI={10.3390/atmos12040522}, abstractNote={In this study, nine different statistical models are constructed using different combinations of predictors, including models with and without projected predictors. Multiple machine learning (ML) techniques are employed to optimize the ensemble predictions by selecting the top performing ensemble members and determining the weights for each ensemble member. The ML-Optimized Ensemble (ML-OE) forecasts are evaluated against the Simple-Averaging Ensemble (SAE) forecasts. The results show that for the response variables that are predicted with significant skill by individual ensemble members and SAE, such as Atlantic tropical cyclone counts, the performance of SAE is comparable to the best ML-OE results. However, for response variables that are poorly modeled by individual ensemble members, such as Atlantic and Gulf of Mexico major hurricane counts, ML-OE predictions often show higher skill score than individual model forecasts and the SAE predictions. However, neither SAE nor ML-OE was able to improve the forecasts of the response variables when all models show consistent bias. The results also show that increasing the number of ensemble members does not necessarily lead to better ensemble forecasts. The best ensemble forecasts are from the optimally combined subset of models.}, number={4}, journal={ATMOSPHERE}, publisher={MDPI AG}, author={Sun, Xia and Xie, Lian and Shah, Shahil Umeshkumar and Shen, Xipeng}, year={2021}, month={Apr} } @article{guan_shen_krim_2021, title={An Automatic Synthesizer of Advising Tools for High Performance Computing}, volume={32}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2020.3018636}, abstractNote={This article presents Egeria, the first automatic synthesizer of advising tools for High-Performance Computing (HPC). When one provides it with some HPC programming guides as inputs, Egeria automatically constructs a text retrieval tool that can advise on what to do to improve the performance of a given program. The advising tool provides a concise list of essential rules automatically extracted from the documents and can retrieve relevant optimization knowledge for optimization questions. Egeria is built based on a distinctive multi-layered design that leverages natural language processing (NLP) techniques and extends them with HPC-specific knowledge and considerations. This article presents the design, implementation, and both quantitative and qualitative evaluation results of Egeria.}, number={2}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Guan, Hui and Shen, Xipeng and Krim, Hamid}, year={2021}, month={Feb}, pages={330–341} } @article{zhao_niu_yuan_cai_sung_liu_liu_shen_ren_wang_et al._2021, title={Brief Industry Paper: Towards Real-Time 3D Object Detection for Autonomous Vehicles with Pruning Search}, ISSN={["1545-3421"]}, DOI={10.1109/RTAS52030.2021.00043}, abstractNote={In autonomous driving, 3D object detection is es-sential as it provides basic knowledge about the environment. However, as deep learning based 3D detection methods are usually computation intensive, it is challenging to support realtime 3D object detection on edge-computing devices in selfdriving cars with limited computation and memory resources. To facilitate this, we propose a compiler-aware pruning search framework, to achieve real-time inference of 3D object detection on the resource-limited mobile devices. Specifically, a generator is applied to sample better pruning proposals in the search space based on current proposals with their performance, and an evaluator is adopted to evaluate the sampled pruning proposal performance. To accelerate the search, the evaluator employs Bayesian optimization with an ensemble of neural predictors. We demonstrate in experiments that for the first time, the pruning search framework can achieve real-time 3D object detection on mobile (Samsung Galaxy S20 phone) with state-of-the-art detection performance.}, journal={2021 IEEE 27TH REAL-TIME AND EMBEDDED TECHNOLOGY AND APPLICATIONS SYMPOSIUM (RTAS 2021)}, author={Zhao, Pu and Niu, Wei and Yuan, Geng and Cai, Yuxuan and Sung, Hsin-Hsuan and Liu, Shaoshan and Liu, Sijia and Shen, Xipeng and Ren, Bin and Wang, Yanzhi and et al.}, year={2021}, pages={425–428} } @article{guan_liu_ma_niu_ren_shen_wang_zhao_2021, title={CoCoPIE: Enabling Real-Time AI on Off-the-Shelf Mobile Devices via Compression-Compilation Co-Design}, volume={64}, ISSN={["1557-7317"]}, DOI={10.1145/3418297}, abstractNote={A new framework allows intelligence on mainstream end devices without special hardware.}, number={6}, journal={COMMUNICATIONS OF THE ACM}, author={Guan, Hui and Liu, Shaoshan and Ma, Xiaolong and Niu, Wei and Ren, Bin and Shen, Xipeng and Wang, Yanzhi and Zhao, Pu}, year={2021}, month={Jun}, pages={62–68} } @article{shen_zhang_dea_andow_arroyo-fang_gafter_george_grueter_meijer_shivers_et al._2021, title={Coarsening Optimization for Differentiable Programming}, volume={5}, ISSN={["2475-1421"]}, url={https://doi.org/10.1145/3485507}, DOI={10.1145/3485507}, abstractNote={This paper presents a novel optimization for differentiable programming named coarsening optimization. It offers a systematic way to synergize symbolic differentiation and algorithmic differentiation (AD). Through it, the granularity of the computations differentiated by each step in AD can become much larger than a single operation, and hence lead to much reduced runtime computations and data allocations in AD. To circumvent the difficulties that control flow creates to symbolic differentiation in coarsening, this work introduces phi-calculus, a novel method to allow symbolic reasoning and differentiation of computations that involve branches and loops. It further avoids "expression swell" in symbolic differentiation and balance reuse and coarsening through the design of reuse-centric segment of interest identification. Experiments on a collection of real-world applications show that coarsening optimization is effective in speeding up AD, producing several times to two orders of magnitude speedups.}, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, publisher={Association for Computing Machinery (ACM)}, author={Shen, Xipeng and Zhang, Guoqiang and Dea, Irene and Andow, Samantha and Arroyo-Fang, Emilio and Gafter, Neal and George, Johann and Grueter, Melissa and Meijer, Erik and Shivers, Olin Grigsby and et al.}, year={2021}, month={Oct} } @article{zhang_pan_zhou_zhai_shen_mutlu_du_2021, title={G-TADOC: Enabling Efficient GPU-Based Text Analytics without Decompression}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE51399.2021.00148}, abstractNote={Text analytics directly on compression (TADOC) has proven to be a promising technology for big data analytics. GPUs are extremely popular accelerators for data analytics systems. Unfortunately, no work so far shows how to utilize GPUs to accelerate TADOC. We describe G-TADOC, the first framework that provides GPU-based text analytics directly on compression, effectively enabling efficient text analytics on GPUs without decompressing the input data. G-TADOC solves three major challenges. First, TADOC involves a large amount of dependencies, which makes it difficult to exploit massive parallelism on a GPU. We develop a novel fine-grained thread-level workload scheduling strategy for GPU threads, which partitions heavily-dependent loads adaptively in a fine-grained manner. Second, in developing G-TADOC, thousands of GPU threads writing to the same result buffer leads to inconsistency while directly using locks and atomic operations lead to large synchronization overheads. We develop a memory pool with thread-safe data structures on GPUs to handle such difficulties. Third, maintaining the sequence information among words is essential for lossless compression. We design a sequence-support strategy, which maintains high GPU parallelism while ensuring sequence information. Our experimental evaluations show that G-TADOC provides 31.1x average speedup compared to state-of-the-art TADOC.}, journal={2021 IEEE 37TH INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE 2021)}, author={Zhang, Feng and Pan, Zaifeng and Zhou, Yanliang and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2021}, pages={1679–1690} } @article{liao_lin_verma_vanderbruggen_emani_nan_shen_2021, title={HPC Ontology: Towards a Unified Ontology for Managing Training Datasets and AI Models for High-Performance Computing}, DOI={10.1109/MLHPC54614.2021.00012}, abstractNote={Machine learning (ML) techniques have been widely studied to address various challenges of productively and efficiently running large-scale scientific applications on heterogeneous supercomputers. However, it is extremely difficult to generate, access, and maintain training datasets and AI models to accelerate ML-based research. The Future of Research Communications and e-Scholarship has proposed the FAIR data principles describing Findability, Accessibility, Interoperability, and Reusability. In this paper, we present our ongoing work of designing an ontology for high-performance computing (named HPC ontology) in order to make training datasets and AI models FAIR. Our ontology provides controlled vocabularies, explicit semantics, and formal knowledge representations. Our design uses an extensible two-level pattern, capturing both high-level meta information and low-level data content for software, hardware, experiments, workflows, training datasets, AI models, and so on. Preliminary evaluation shows that HPC ontology is effective to annotate selected data and support a set of SPARQL queries.}, journal={PROCEEDINGS OF THE WORKSHOP ON MACHINE LEARNING IN HIGH PERFORMANCE COMPUTING ENVIRONMENTS (MLHPC 2021)}, author={Liao, Chunhua and Lin, Pei-Hung and Verma, Gaurav and Vanderbruggen, Tristan and Emani, Murali and Nan, Zifan and Shen, Xipeng}, year={2021}, pages={69–80} } @article{verma_emani_liao_lin_vanderbruggen_shen_chapman_2021, title={HPCFAIR: Enabling FAIR AI for HPC Applications}, DOI={10.1109/MLHPC54614.2021.00011}, abstractNote={Artificial Intelligence (AI) is being adopted in different domains at an unprecedented scale. A significant interest in the scientific community also involves leveraging machine learning (ML) to effectively run high performance computing applications at scale. Given multiple efforts in this arena, there are often duplicated efforts when existing rich data sets and ML models could be leveraged instead. The primary challenge is a lack of an ecosystem to reuse and reproduce the models and datasets. In this work, we propose HPCFAIR, a modular, extensible framework to enable AI models to be Findable, Accessible, Interoperable and Reproducible (FAIR). It enables users with a structured approach to search, load, save and reuse the models in their codes. We present the design, implementation of our framework and highlight how it can be seamlessly integrated to ML-driven applications for high performance computing applications and scientific machine learning workloads.}, journal={PROCEEDINGS OF THE WORKSHOP ON MACHINE LEARNING IN HIGH PERFORMANCE COMPUTING ENVIRONMENTS (MLHPC 2021)}, author={Verma, Gaurav and Emani, Murali and Liao, Chunhua and Lin, Pei-Hung and Vanderbruggen, Tristan and Shen, Xipeng and Chapman, Barbara}, year={2021}, pages={58–68} } @article{ye_xu_shen_liao_jin_solihin_2021, title={Hardware-Based Address-Centric Acceleration of Key-Value Store}, ISSN={["1530-0897"]}, DOI={10.1109/HPCA51647.2021.00067}, abstractNote={Efficiently retrieving data is essential for key-value store applications. A major part of the retrieving time is on data addressing, that is, finding the location of the value in memory that corresponds to a key. This paper introduces an address-centric approach to speed up the addressing by creating a shortcut for the translation of a key to the physical address of the value. The new technique is materialized with a novel in-memory table, STLT, a virtual-physical address buffer, and two new instructions. It creates a fast path for data addressing and meanwhile opens up opportunities for the use of simpler and faster hash tables to strike a better tradeoff between hashing conflicts and hashing overhead. Together, the new technique brings up to 1.4× speedups on key-value store application Redis and up to 13× speedups on some widely used indexing data structures, consistently outperforming prior solutions significantly.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2021}, pages={736–748} } @article{guan_chaudhary_xu_ning_zhang_shen_2021, title={Recurrent Neural Networks Meet Context-Free Grammar: Two Birds with One Stone}, ISSN={["1550-4786"]}, DOI={10.1109/ICDM51629.2021.00125}, abstractNote={Recurrent Neural Networks (RNN) are widely used for various prediction tasks on sequences such as text, speed signals, program traces, and system logs. Due to RNNs’ inherently sequential behavior, one key challenge for the effective adoption of RNNs is to reduce the time spent on RNN inference and to increase the scope of a prediction. This work introduces CFG-guided compressed learning, an approach that creatively integrates Context-Free Grammar (CFG) and online tokenization into RNN learning and inference for streaming inputs. Through a hierarchical compression algorithm, it compresses an input sequence to a CFG and makes predictions based on the compressed sequence. Its algorithm design employs a set of techniques to overcome the issues from the myopic nature of online tokenization, the tension between inference accuracy and compression rate, and other complexities. Experiments on 16 real-world sequences of various types validate that the proposed compressed learning can successfully recognize and leverage repetitive patterns in input sequences, and effectively translate them into dramatic (1-1762×) inference speedups as well as much (1-7830×) expanded prediction scope, while keeping the inference accuracy satisfactory.}, journal={2021 21ST IEEE INTERNATIONAL CONFERENCE ON DATA MINING (ICDM 2021)}, author={Guan, Hui and Chaudhary, Umang and Xu, Yuanchao and Ning, Lin and Zhang, Lijun and Shen, Xipeng}, year={2021}, pages={1078–1083} } @article{zhang_guan_ding_shen_krim_2021, title={Reuse-centric k-means configuration}, volume={100}, ISSN={["1873-6076"]}, url={https://doi.org/10.1016/j.is.2021.101787}, DOI={10.1016/j.is.2021.101787}, abstractNote={K-means configuration is to find a configuration of k-means (e.g., the number of clusters, feature sets) that maximize some objectives. It is a time-consuming process due to the iterative nature of k-means. This paper proposes reuse-centric k-means configuration to accelerate k-means configuration. It is based on the observation that the explorations of different configurations share lots of common or similar computations. Effectively reusing the computations from prior trials of different configurations could largely shorten the configuration time. To materialize the idea, the paper presents a set of novel techniques, including reuse-based filtering, center reuse, and a two-phase design to capitalize on the reuse opportunities on three levels: validation, number of clusters, and feature sets. Experiments on k-means–based data classification tasks show that reuse-centric k-means configuration can speed up a heuristic search-based configuration process by a factor of 5.8, and a uniform search-based attainment of classification error surfaces by a factor of 9.1. The paper meanwhile provides some important insights on how to effectively apply the acceleration techniques to tap into a full potential.}, journal={INFORMATION SYSTEMS}, author={Zhang, Lijun and Guan, Hui and Ding, Yufei and Shen, Xipeng and Krim, Hamid}, year={2021}, month={Sep} } @article{yang_shen_lim_2021, title={Revisit the Scalability of Deep Auto-Regressive Models for Graph Generation}, ISSN={["2161-4393"]}, DOI={10.1109/IJCNN52387.2021.9534206}, abstractNote={As a new promising approach to graph generations, deep auto-regressive graph generation has drawn increasing attention. It however has been commonly deemed as hard to scale up to work with large graphs. In existing studies, it is perceived that the consideration of the full non-local graph dependences is indispensable for this approach to work, which entails the needs for keeping the entire graph's info in memory and hence the perceived “inherent” scalability limitation of the approach. This paper revisits the common perception. It proposes three ways to relax the dependences and conducts a series of empirical measurements. It concludes that the perceived “inherent” scalability limitation is a misperception; with the right design and implementation, deep auto-regressive graph generation can be applied to graphs much larger than the device memory. The rectified perception removes a fundamental barrier for this approach to meet practical needs.}, journal={2021 INTERNATIONAL JOINT CONFERENCE ON NEURAL NETWORKS (IJCNN)}, author={Yang, Shuai and Shen, Xipeng and Lim, Seung-Hwan}, year={2021} } @article{ul mustafa_xu_shen_solihin_2021, title={Seeds of SEED: New Security Challenges for Persistent Memory}, DOI={10.1109/SEED51797.2021.00020}, abstractNote={Persistent Memeory Object (PMO) is a general system abstraction for holding persistent data in persistent main memory, managed by an operating system. PMO programming model breaks inter-process isolation as it results in sharing of persistent data between two processes as they alternatively access the same PMO. The uncoordinated data-access opens a new avenue for cross-run and cross-process security attacks.In this paper, we discuss threat vulnerabilities that are either new or increased in intensity under PMO programming model. We also discuss security implications of using the PMO, highlighting sample PMO-based attacks and potential strategies to defend against them.}, journal={2021 INTERNATIONAL SYMPOSIUM ON SECURE AND PRIVATE EXECUTION ENVIRONMENT DESIGN (SEED 2021)}, author={Ul Mustafa, Naveed and Xu, Yuanchao and Shen, Xipeng and Solihin, Yan}, year={2021}, pages={83–88} } @article{agrawal_yang_agrawal_yedida_shen_menzies_2021, title={Simpler Hyperparameter Optimization for Software Analytics: Why, How, When}, volume={48}, ISSN={0098-5589 1939-3520 2326-3881}, url={http://dx.doi.org/10.1109/TSE.2021.3073242}, DOI={10.1109/TSE.2021.3073242}, abstractNote={How can we make software analytics simpler and faster? One method is to match the complexity of analysis to the intrinsic complexity of the data being explored. For example, hyperparameter optimizers find the control settings for data miners that improve the predictions generated via software analytics. Sometimes, very fast hyperparameter optimization can be achieved by "DODGE-ing"; i.e. simply steering way from settings that lead to similar conclusions. But when is it wise to use that simple approach and when must we use more complex (and much slower) optimizers?} To answer this, we applied hyperparameter optimization to 120 SE data sets that explored bad smell detection, predicting Github issue close time, bug report analysis, defect prediction, and dozens of other non-SE problems. We find that the simple DODGE works best for data sets with low "intrinsic dimensionality" (u ~ 3) and very poorly for higher-dimensional data (u > 8). Nearly all the SE data seen here was intrinsically low-dimensional, indicating that DODGE is applicable for many SE analytics tasks.}, number={8}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Agrawal, Amritanshu and Yang, Xueqi and Agrawal, Rishabh and Yedida, Rahul and Shen, Xipeng and Menzies, Tim}, year={2021}, pages={1–1} } @article{ye_xu_shen_liao_jin_solihin_2021, title={Supporting Legacy Libraries on Non-Volatile Memory: A User-Transparent Approach}, ISSN={["1063-6897"]}, DOI={10.1109/ISCA52012.2021.00042}, abstractNote={As mainstream computing is poised to embrace the advent of byte-addressable non-volatile memory (NVM), an important roadblock has remained largely unnoticed, support of legacy libraries on NVM. Libraries underpin modern software everywhere. As current NVM programming interfaces all designate special types and constructs for NVM objects and references, legacy libraries, being incompatible with these data types, will face major obstacles for working with future applications written for NVM. This paper introduces a simple approach to mitigating the issue. The novel approach centers around user-transparent persistent reference, a new concept that allows programmers to reference a persistent object in the same way as reference a normal (volatile) object. The paper presents the implementation of the concept, carefully examines its soundness, and describes compiler and simple architecture support for keeping performance overheads very low.}, journal={2021 ACM/IEEE 48TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2021)}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2021}, pages={443–455} } @article{zhang_zhai_shen_wang_chen_mutlu_chen_du_2021, title={TADOC: Text analytics directly on compression}, volume={30}, ISSN={["0949-877X"]}, DOI={10.1007/s00778-020-00636-3}, abstractNote={This article provides a comprehensive description of Text Analytics Directly on Compression (TADOC), which enables direct document analytics on compressed textual data. The article explains the concept of TADOC and the challenges to its effective realizations. Additionally, a series of guidelines and technical solutions that effectively address those challenges, including the adoption of a hierarchical compression method and a set of novel algorithms and data structure designs, are presented. Experiments on six data analytics tasks of various complexities show that TADOC can save 90.8% storage space and 87.9% memory usage, while halving data processing times.}, number={2}, journal={VLDB JOURNAL}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Wang, Dalin and Chen, Zheng and Mutlu, Onur and Chen, Wenguang and Du, Xiaoyong}, year={2021}, month={Mar}, pages={163–188} } @article{tan_chen_liu_ren_song_shen_liu_2021, title={Toward Efficient Interactions between Python and Native Libraries}, DOI={10.1145/3468264.3468541}, abstractNote={Python has become a popular programming language because of its excellent programmability. Many modern software packages utilize Python for high-level algorithm design and depend on native libraries written in C/C++/Fortran for efficient computation kernels. Interaction between Python code and native libraries introduces performance losses because of the abstraction lying on the boundary of Python and native libraries. On the one side, Python code, typically run with interpretation, is disjoint from its execution behavior. On the other side, native libraries do not include program semantics to understand algorithm defects.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Tan, Jialiang and Chen, Yu and Liu, Zhenming and Ren, Bin and Song, Shuaiwen Leon and Shen, Xipeng and Liu, Xu}, year={2021}, pages={1117–1128} } @article{zhang_xu_shen_dillig_2021, title={UDF to SQL Translation through Compositional Lazy Inductive Synthesis}, volume={5}, ISSN={["2475-1421"]}, url={https://doi.org/10.1145/3485489}, DOI={10.1145/3485489}, abstractNote={Many data processing systems allow SQL queries that call user-defined functions (UDFs) written in conventional programming languages. While such SQL extensions provide convenience and flexibility to users, queries involving UDFs are not as efficient as their pure SQL counterparts that invoke SQL’s highly-optimized built-in functions. Motivated by this problem, we propose a new technique for translating SQL queries with UDFs to pure SQL expressions. Unlike prior work in this space, our method is not based on syntactic rewrite rules and can handle a much more general class of UDFs. At a high-level, our method is based on counterexample-guided inductive synthesis (CEGIS) but employs a novel compositional strategy that decomposes the synthesis task into simpler sub-problems. However, because there is no universal decomposition strategy that works for all UDFs, we propose a novel lazy inductive synthesis approach that generates a sequence of decompositions that correspond to increasingly harder inductive synthesis problems. Because most realistic UDF-to-SQL translation tasks are amenable to a fine-grained decomposition strategy, our lazy inductive synthesis method scales significantly better than traditional CEGIS. We have implemented our proposed technique in a tool called CLIS for optimizing Spark SQL programs containing Scala UDFs. To evaluate CLIS, we manually study 100 randomly selected UDFs and find that 63 of them can be expressed in pure SQL. Our evaluation on these 63 UDFs shows that CLIS can automatically synthesize equivalent SQL expressions in 92% of the cases and that it can solve 2.4× more benchmarks compared to a baseline that does not use our compositional approach. We also show that CLIS yields an average speed-up of 3.5× for individual UDFs and 1.3× to 3.1× in terms of end-to-end application performance.}, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, publisher={Association for Computing Machinery (ACM)}, author={Zhang, Guoqiang and Xu, Yuanchao and Shen, Xipeng and Dillig, Isil}, year={2021}, month={Oct} } @article{li_zhang_shen_2020, title={DIAC An Inter-app Conflicts Detector for Open IoT Systems}, volume={19}, ISSN={["1558-3465"]}, DOI={10.1145/3391895}, abstractNote={This article tackles the problem of detecting and solving potential conflicts among independently developed apps that are to be installed into an open Internet-of-Things (IoT) environment. It provides a new set of definitions and categorizations of the conflicts to more precisely characterize the nature of the problem, and it proposes a representation named “IA Graphs” for formally representing IoT controls and inter-app interplays. Based on the definitions and representation, it then describes an efficient conflict detection algorithm. Combining conflict categories, seriousness indicator, and conflict frequency, an innovative solution policy for solving various detected conflicts is developed, which also takes into account user preference and interest by providing interactive process. It implements a compiler and runtime software system that integrates all the proposed techniques together into a comprehensive solution. Experiments on SmartThings apps validate its significantly better detection efficacy over prior methods and effectiveness of conflict solution with user preference.}, number={6}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Li, Xinyi and Zhang, Lei and Shen, Xipeng}, year={2020}, month={Nov} } @inproceedings{zhang_zhai_shen_mutlu_du_2020, title={Enabling Efficient Random Access to Hierarchically-Compressed Data}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE48307.2020.00097}, abstractNote={Recent studies have shown the promise of direct data processing on hierarchically-compressed text documents. By removing the need for decompressing data, the direct data processing technique brings large savings in both time and space. However, its benefits have been limited to data traversal operations; for random accesses, direct data processing is several times slower than the state-of-the-art baselines. This paper presents a set of techniques that successfully eliminate the limitation, and for the first time, establishes the feasibility of effectively handling both data traversal operations and random data accesses on hierarchically-compressed data. The work yields a new library, which achieves 3.1× speedup over the state-of-the-art on random data accesses to compressed data, while preserving the capability of supporting traversal operations efficiently and providing large (3.9×) space savings.}, booktitle={2020 IEEE 36th International Conference on Data Engineering (ICDE)}, author={Zhang, Feng and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong}, year={2020}, pages={1069–1080} } @article{zhou_zhao_shen_chen_2020, title={Enabling Runtime SpMV Format Selection through an Overhead Conscious Method}, volume={31}, DOI={10.1109/TPDS.2019.2932931}, abstractNote={Sparse matrix-vector multiplication (SpMV) is an important kernel and its performance is critical for many applications. Storage format selection is to select the best format to store a sparse matrix; it is essential for SpMV performance. Prior studies have focused on predicting the format that helps SpMV run fastest, but have ignored the runtime prediction and format conversion overhead. This work shows that the runtime overhead makes the predictions from previous solutions frequently sub-optimal and sometimes inferior regarding the end-to-end time. It proposes a new paradigm for SpMV storage selection, an overhead-conscious method. Through carefully designed regression models and neural network-based time series prediction models, the method captures the influence imposed on the overall program performance by the overhead and the benefits of format prediction and conversions. The method employs a novel two-stage lazy-and-light scheme to help control the possible negative effects of format predictions, and at the same time, maximize the overall format conversion benefits. Experiments show that the technique outperforms previous techniques significantly. It improves the overall performance of applications by 1.21X to 1.53X, significantly larger than the 0.83X to 1.25X upper-bound speedups overhead-oblivious methods could give.}, number={1}, journal={IEEE Transactions on Parallel and Distributed Systems}, author={Zhou, Weijie and Zhao, Yue and Shen, Xipeng and Chen, Wang}, year={2020}, month={Jan}, pages={80–93} } @article{oh_zheng_shen_zhai_yi_2020, title={GOPipe: A Granularity-Oblivious Programming Framework for Pipelined Stencil Executions on GPU}, ISSN={["1089-795X"]}, DOI={10.1145/3410463.3414656}, abstractNote={Recent studies have shown promising performance benefits when multiple stages of a pipelined stencil application are mapped to different parts of a GPU to run concurrently. An important factor for the computing efficiency of such pipelines is the granularity of a task. In previous programming frameworks that support true pipelined computations on GPU, the choice has to be made by the programmers during the application development time. Due to many difficulties, programmers' decisions are often far from optimal, causing inferior performance and performance portability.}, journal={PACT '20: PROCEEDINGS OF THE ACM INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES}, author={Oh, Chanyoung and Zheng, Zhen and Shen, Xipeng and Zhai, Jidong and Yi, Youngmin}, year={2020}, pages={43–54} } @article{zhou_zhao_zhang_shen_2020, title={HARP: Holistic Analysis for Refactoring Python-Based Analytics Programs}, ISSN={["0270-5257"]}, DOI={10.1145/3377811.3380434}, abstractNote={Modern machine learning programs are often written in Python, with the main computations specified through calls to some highly optimized libraries (e.g., TensorFlow, PyTorch). How to maximize the computing efficiency of such programs is essential for many application domains, which has drawn lots of recent attention. This work points out a common limitation in existing efforts: they focus their views only on the static computation graphs specified by library APIs, but leave the influence from the hosting Python code largely unconsidered. The limitation often causes them to miss the big picture and hence many important optimization opportunities. This work proposes a new approach named HARP to address the problem. HARP enables holistic analysis that spans across computation graphs and their hosting Python code. HARP achieves it through a set of novel techniques: analytics-conscious speculative analysis to circumvent Python complexities, a unified representation augmented computation graphs to capture all dimensions of knowledge related with the holistic analysis, and conditioned feedback mechanism to allow risk-controlled aggressive analysis. Refactoring based on HARP gives 1.3--3X and 2.07X average speedups on a set of TensorFlow and PyTorch programs.}, journal={2020 ACM/IEEE 42ND INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2020)}, author={Zhou, Weijie and Zhao, Yue and Zhang, Guoqiang and Shen, Xipeng}, year={2020}, pages={506–517} } @article{xu_ye_solihin_shen_2020, title={Hardware-Based Domain Virtualization for Intra-Process Isolation of Persistent Memory Objects}, ISSN={["0884-7495"]}, DOI={10.1109/ISCA45697.2020.00062}, abstractNote={Persistent memory has appealing properties in serving as main memory. While file access is protected by system calls, an attached persistent memory object (PMO) is one load/store away from accidental (or malicious) reads or writes, which may arise from use of just one buggy library. The recent progress in intra-process isolation could potentially protect PMO by enabling a process to partition sensitive data and code into isolated components. However, the existing intra-process isolations (e.g., Intel MPK) support isolation of only up to 16 domains, forming a major barrier for PMO protections. Although there is some recent effort trying to virtualize MPK to circumvent the limit, it suffers large overhead. This paper presents two novel architecture supports, which provide 11 - 52 × higher efficiency while offering the first known domain-based protection for PMOs.}, journal={2020 ACM/IEEE 47TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2020)}, author={Xu, Yuanchao and Ye, ChenCheng and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={680–692} } @article{xu_solihin_shen_2020, title={MERR: Improving Security of Persistent Memory Objects via Efficient Memory Exposure Reduction and Randomization}, DOI={10.1145/3373376.3378492}, abstractNote={This paper proposes a new defensive technique for memory, especially useful for long-living objects on Non-Volatile Memory (NVM), or called Persistent Memory objects (PMOs). The method takes a distinctive perspective, trying to reduce memory exposure time by largely shortening the overhead in attaching and detaching PMOs into the memory space. It does it through a novel idea, embedding page table subtrees inside PMOs. The paper discusses the complexities the technique brings, to permission controls and hardware implementations, and provides solutions. Experimental results show that the new technique reduces memory exposure time by 60% with a 5% time overhead (70% with 10.9% overhead). It allows much more frequent address randomizations (shortening the period from seconds to less than 41.4us), offering significant potential for enhancing memory security.}, journal={TWENTY-FIFTH INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS (ASPLOS XXV)}, author={Xu, Yuanchao and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={987–1000} } @article{jin_shen_lovas_liao_2020, title={Special Issue: Graph Computing}, volume={32}, ISSN={["1532-0634"]}, DOI={10.1002/cpe.5452}, abstractNote={Graph computing now is popular in many areas, including social network and gene sequence alignment. Graph computing system and algorithm have a history prior to the use of graph databases and have a future that is not necessarily entangled with typical database concerns. With the data's increasing size, many distributed graph-computing systems have been developed in recent years to process and analyze massive graphs. Researchers pay more attention on the graph partition schemes on distributed environment. However, other researchers think a single system can avoid the network overhead and may have better performance even if the data size is too big for the memory space. With the rapid development of coprocessors, some researchers think it is promising to build a domain specific computer, just for graph computing. The proposed special issue of Concurrency and Computation: Practice and Experience contains revised and extended versions of selected best papers with respect to graph computing at the 21st IEEE International Conference on Parallel and Distributed Systems (ICPADS’16), which was held at Wuhan, China, on December 13-16, 2016. Established in 1992, ICPADS has been a major international forum for scientists, engineers, and users to exchange and share their experiences, new ideas, and latest research results on all aspects of parallel and distributed computing systems. The purpose of this special issue is to provide a comprehensive view into recent advances in systems software, algorithms, partition schemes, and even graph computer based on new advances in computer architecture and applications. The five selected papers are summarized as follows. The first paper, titled “An efficient iterative graph data processing framework based on bulk synchronous parallel model” by Liu et al,1 presents an efficient computational framework for graph data processing based on the bulk synchronous parallel model. Existing Pregel-like graph processing systems remains in its early stage, and there still exist many challenges with prohibitive superstep-synchronized overhead. Furthermore, the graph data partition strategy in these earlier graph systems fails to support load balancing, therefore causing the increase of network I/O overhead as the scale of graph data grows. Thus, this paper leverages a global synchronization mechanism to enhance the performance of graph computation. Meanwhile, a balanced hash-based graph partition mechanism is presented to optimize the large-scale graph data processing. The work has a real implementation upon on Pregrel system, which can better support a variety of graph analytics applications. The second paper, titled “An efficient iterative graph data processing framework based on bulk synchronous parallel model” by Linchen Yu,2 proposes an optimized scheduling system for parallelizing the programs in the Xen. Virtualization challenges the traditional CPU scheduling, leading that the spin lock in virtualized environment can be preempted by the VMM, increasing synchronization overhead and decreasing the performance of parallel programs. Many studies have proposed the co-scheduling to alleviate this problem. However, these earlier attempts are not suitable to non-parallel workloads with the CPU fragmentation problem as well. Therefore, a simultaneous optimization scheduling system, called CCHybrid, is proposed in the Xen virtualized environment. Results show the efficiency of CCHybrid over the traditional Xen Credit scheduler. The third paper, titled “ms-PoSW: A multi-server aided proof of shared ownership scheme for secure deduplication in cloud” by Xiong et al,3 introduces a novel concept of the Proof for securing client-side deduplication of the shared files. With the rapid development of cloud computing and big data technologies, collaborative cloud applications are inextricably linked to our daily life and, therefore, produce a large number of shared files, which is challenging for secure access and data duplication in cloud. This paper proposes a novel multiserver-aided PoSW scheme for collaborative cloud applications and propose a hybrid PoSW scheme to reduce the computational cost of the shared owner's client. Furthermore, a hybrid PoSW scheme is constructed to address the secure proof of hybrid cloud architectures. The fourth paper, titled “Sparse random compressive sensing based data aggregation in wireless sensor networks” by Yin et al,4 introduces a compressive data aggregation scheme. In wireless sensor networks, the increasingly expanding data volume has high spatial-temporal correlation. Although some earlier studies attempt to eliminate data redundancy, few can handle energy consumption and latency simultaneously. In this paper, the authors a delay-minimum energy-balanced data aggregation method, which can eliminate the redundancy among the readings and prolong the network lifetime. A sparse random matrix is adopted as a measurement matrix to balance communication cost. Particularly, each measurement can form an aggregation tree with minimum delay. Furthermore, a novel scheduling method is used to avoid information interference as well. The fifth paper, titled “Dynamic cluster strategy for hierarchical rollback-recovery protocols in MPI HPC applications” by Liao et al,5 proposes a dynamic cluster strategy to adapt to the runtime variation of communication pattern by using a prediction scheme. The idea comes from a fact that Hierarchical rollback-recovery protocols provide failure containment and reduce the amount of message to be logged, making it an attractive and scalable solution for fault tolerance even at a large scale. This paper shows how the communication pattern changes with the stages of application because MPI HPC applications scale up and become more complex. Therefore, to further increase the efficiency of hierarchical rollback-recovery protocols, the authors propose a dynamic cluster strategy (DCS) to adapt to the change of communication pattern. In contrast to the existing static process partition algorithms, this strategy adopts a prediction mechanism by using the clusters of processes obtained from prior part of applications in the succeeding part. Detailed experiments are then performed to evaluate the effectiveness and efficiency DCS at an extremely large scale. We hope that the readers would find the contents of this special issue interesting and further inspire them to look ahead into the challenges of designing, exploring, and exploiting graph analytics applications.}, number={3}, journal={CONCURRENCY AND COMPUTATION-PRACTICE & EXPERIENCE}, author={Jin, Hai and Shen, Xipeng and Lovas, Robert and Liao, Xiaofei}, year={2020}, month={Feb} } @article{ning_guan_shen_2019, title={Adaptive Deep Reuse: Accelerating CNN Training on the Fly}, ISSN={["1084-4627"]}, DOI={10.1109/ICDE.2019.00138}, abstractNote={This work proposes adaptive deep reuse, a method for accelerating CNN training by identifying and avoiding the unnecessary computations contained in each specific training on the fly. It makes two-fold major contributions. (1) It empirically proves the existence of a lot of similarities among neuron vectors in both forward and backward propagation of CNN. (2) It introduces the first adaptive strategy for translating the similarities into computation reuse in CNN training. The strategy adaptively adjusts the strength of reuse based on the different tolerance of precision relaxation in different CNN training stages. Experiments show that adaptive deep reuse saves 69% CNN training time with no accuracy loss.}, journal={2019 IEEE 35TH INTERNATIONAL CONFERENCE ON DATA ENGINEERING (ICDE 2019)}, author={Ning, Lin and Guan, Hui and Shen, Xipeng}, year={2019}, pages={1538–1549} } @inproceedings{ning_shen_2019, title={Deep reuse}, ISBN={9781450360791}, url={http://dx.doi.org/10.1145/3330345.3330384}, DOI={10.1145/3330345.3330384}, abstractNote={This paper presents deep reuse, a method for speeding up CNN inferences by detecting and exploiting deep reusable computations on the fly. It empirically reveals the massive similarities among neuron vectors in activation maps, both within CNN inferences on an input and across inputs. It gives an in-depth study on how to effectively turn the similarities into beneficial computation reuse to speed up CNN inferences. The investigation covers various factors, ranging from the clustering methods for similarity detection, to clustering scopes, similarity metrics, and neuron vector granularities. The insights help create deep reuse. As an on-line method, deep reuse is easy to apply, and adapts to each CNN (compressed or not) and its input. Using no special hardware support or CNN model changes, this method speeds up inferences by 1.77--2X (up to 4.3X layer-wise) on the fly with virtually no (