@article{ye_xu_shen_sha_liao_jin_solihin_2023a, title={Reconciling Selective Logging and Hardware Persistent Memory Transaction}, ISSN={1530-0897}, DOI={10.1109/HPCA56546.2023.10071088}, abstractNote={Log creation, maintenance, and its persist ordering are known to be performance bottlenecks for durable transactions on persistent memory. Existing hardware persistent memory transactions overlook an important opportunity for improving performance: some persistent data is algorithmically redundant such that it can be recovered from other data, removing the need for logging such data. The paper presents an ISA extension that enables selective logging for hardware persistent memory transactions for the first time. The ISA extension features two novel components: fine-grain logging and lazy persistency. Fine-grain logging allows hardware to log updates on data in the granularity of words without lengthening the critical path of data accesses. Lazy persistency allows updated data to remain in the cache after the transaction commits. Together, the new hardware persistent memory transaction outperforms the state-of-the-art hardware counterpart by 1.8× on average.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, HPCA}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Sha, Yan and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2023}, pages={664--676} } @article{ye_xu_shen_sha_liao_jin_solihin_2023b, title={SpecPMT: Speculative Logging for Resolving Crash Consistency Overhead of Persistent Memory}, DOI={10.1145/3575693.3575696}, abstractNote={Crash consistency overhead is a long-standing barrier to the adoption of byte-addressable persistent memory in practice. Despite continuous progress, persistent transactions for crash consistency still incur a 5.6X slowdown, making persistent memory prohibitively costly in practical settings. 
This paper introduces speculative logging, a new method that forgoes most memory fences and reduces data persistence overhead by logging data values early. This technique enables a novel persistent transaction model, speculatively persistent memory transactions (SpecPMT). Our evaluation shows that SpecPMT reduces the execution time overheads of persistent transactions substantially to just 10%.}, journal={PROCEEDINGS OF THE 28TH ACM INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS, VOL 2, ASPLOS 2023}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Sha, Yan and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2023}, pages={762--777} } @article{sung_xu_guan_niu_ren_wang_liu_shen_2022, title={Brief Industry Paper: Enabling Level-4 Autonomous Driving on a Single \$1k Off-the-Shelf Card}, ISSN={1545-3421}, DOI={10.1109/RTAS54340.2022.00032}, abstractNote={In the past few years we have developed hardware computing systems for commercial autonomous vehicles, but inevitably the high development cost and long turn-around time have been major roadblocks for commercial deployment. Hence we also explored the potential of software optimization. This paper, for the first time, shows that it is feasible to enable full level-4 autonomous driving workloads on a single off-the-shelf card (Jetson AGX Xavier) for less than ${\$}1\mathrm{k}$, an order of magnitude less than the state-of-the-art systems, while meeting all the requirements of latency. 
The success comes from the resolution of some important issues shared by existing practices through a series of measures and innovations.}, journal={2022 IEEE 28TH REAL-TIME AND EMBEDDED TECHNOLOGY AND APPLICATIONS SYMPOSIUM (RTAS)}, author={Sung, Hsin-Hsuan and Xu, Yuanchao and Guan, Jiexiong and Niu, Wei and Ren, Bin and Wang, Yanzhi and Liu, Shaoshan and Shen, Xipeng}, year={2022}, pages={297--300} } @article{xu_ye_solihin_shen_2022, title={FFCCD: Fence-Free Crash-Consistent Concurrent Defragmentation for Persistent Memory}, ISSN={1063-6897}, DOI={10.1145/3470496.3527406}, abstractNote={Persistent Memory (PM) is increasingly supplementing or substituting DRAM as main memory. Prior work has focused on reusability and memory leaks of persistent memory but have not addressed a problem amplified by persistence, persistent memory fragmentation, which refers to the continuous worsening of fragmentation of persistent memory throughout its usage. This paper reveals the challenges and proposes the first systematic crash-consistent solution, Fence-Free Crash-consistent Concurrent Defragmentation (FFCCD). FFCCD reuses persistent pointer format, root nodes and typed allocation provided by persistent memory programming model to enable concurrent defragmentation on PM. FFCCD introduces architecture support for concurrent defragmentation that enables a fence-free design and fast read barrier, reducing two major overheads of defragmenting persistent memory. 
The technique is effective (28--73% fragmentation reduction) and fast (4.1% execution time overhead).}, journal={PROCEEDINGS OF THE 2022 THE 49TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA '22)}, author={Xu, Yuanchao and Ye, Chencheng and Solihin, Yan and Shen, Xipeng}, year={2022}, pages={274--288} } @article{ye_xu_shen_jin_liao_solihin_2022, title={Preserving Addressability Upon GC-Triggered Data Movements on Non-Volatile Memory}, volume={19}, ISSN={1544-3973}, DOI={10.1145/3511706}, abstractNote={This article points out an important threat that application-level Garbage Collection (GC) creates to the use of non-volatile memory (NVM). Data movements incurred by GC may invalidate the pointers to objects on NVM and, hence, harm the reusability of persistent data across executions. The article proposes the concept of movement-oblivious addressing (MOA), and develops and compares three novel solutions to materialize the concept for solving the addressability problem. It evaluates the designs on five benchmarks and a real-world application. The results demonstrate the promise of the proposed solutions, especially hardware-supported Multi-Level GPointer, in addressing the problem in a space- and time-efficient manner.}, number={2}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Jin, Hai and Liao, Xiaofei and Solihin, Yan}, year={2022}, month=jun } @article{xu_ye_shen_solihin_2022, title={Temporal Exposure Reduction Protection for Persistent Memory}, ISSN={1530-0897}, DOI={10.1109/HPCA53966.2022.00071}, abstractNote={The long-living nature and byte-addressability of persistent memory (PM) amplifies the importance of strong memory protections. This paper develops temporal exposure reduction protection (TERP) as a framework for enforcing memory safety. 
Aiming to minimize the time when a PM region is accessible, TERP offers a complementary dimension of memory protection. The paper gives a formal definition of TERP, explores the semantics space of TERP constructs, and the relations with security and composability in both sequential and parallel executions. It proposes programming system and architecture solutions for the key challenges for the adoption of TERP, which draws on novel supports in both compilers and hardware to efficiently meet the exposure time target. Experiments validate the efficacy of the proposed support of TERP, in both efficiency and exposure time minimization.}, journal={2022 IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2022)}, author={Xu, Yuanchao and Ye, Chencheng and Shen, Xipeng and Solihin, Yan}, year={2022}, pages={908--924} } @article{ye_xu_shen_liao_jin_solihin_2021a, title={Hardware-Based Address-Centric Acceleration of Key-Value Store}, ISSN={1530-0897}, DOI={10.1109/HPCA51647.2021.00067}, abstractNote={Efficiently retrieving data is essential for key-value store applications. A major part of the retrieving time is on data addressing, that is, finding the location of the value in memory that corresponds to a key. This paper introduces an address-centric approach to speed up the addressing by creating a shortcut for the translation of a key to the physical address of the value. The new technique is materialized with a novel in-memory table, STLT, a virtual-physical address buffer, and two new instructions. It creates a fast path for data addressing and meanwhile opens up opportunities for the use of simpler and faster hash tables to strike a better tradeoff between hashing conflicts and hashing overhead. 
Together, the new technique brings up to 1.4× speedups on key-value store application Redis and up to 13× speedups on some widely used indexing data structures, consistently outperforming prior solutions significantly.}, journal={2021 27TH IEEE INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE (HPCA 2021)}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2021}, pages={736--748} } @article{guan_chaudhary_xu_ning_zhang_shen_2021, title={Recurrent Neural Networks Meet Context-Free Grammar: Two Birds with One Stone}, ISSN={1550-4786}, DOI={10.1109/ICDM51629.2021.00125}, abstractNote={Recurrent Neural Networks (RNN) are widely used for various prediction tasks on sequences such as text, speed signals, program traces, and system logs. Due to RNNs’ inherently sequential behavior, one key challenge for the effective adoption of RNNs is to reduce the time spent on RNN inference and to increase the scope of a prediction. This work introduces CFG-guided compressed learning, an approach that creatively integrates Context-Free Grammar (CFG) and online tokenization into RNN learning and inference for streaming inputs. Through a hierarchical compression algorithm, it compresses an input sequence to a CFG and makes predictions based on the compressed sequence. Its algorithm design employs a set of techniques to overcome the issues from the myopic nature of online tokenization, the tension between inference accuracy and compression rate, and other complexities. 
Experiments on 16 real-world sequences of various types validate that the proposed compressed learning can successfully recognize and leverage repetitive patterns in input sequences, and effectively translate them into dramatic (1-1762×) inference speedups as well as much (1-7830×) expanded prediction scope, while keeping the inference accuracy satisfactory.}, journal={2021 21ST IEEE INTERNATIONAL CONFERENCE ON DATA MINING (ICDM 2021)}, author={Guan, Hui and Chaudhary, Umang and Xu, Yuanchao and Ning, Lin and Zhang, Lijun and Shen, Xipeng}, year={2021}, pages={1078--1083} } @article{ul_mustafa_xu_shen_solihin_2021, title={Seeds of SEED: New Security Challenges for Persistent Memory}, DOI={10.1109/SEED51797.2021.00020}, abstractNote={Persistent Memory Object (PMO) is a general system abstraction for holding persistent data in persistent main memory, managed by an operating system. PMO programming model breaks inter-process isolation as it results in sharing of persistent data between two processes as they alternatively access the same PMO. The uncoordinated data-access opens a new avenue for cross-run and cross-process security attacks. In this paper, we discuss threat vulnerabilities that are either new or increased in intensity under PMO programming model. 
We also discuss security implications of using the PMO, highlighting sample PMO-based attacks and potential strategies to defend against them.}, journal={2021 INTERNATIONAL SYMPOSIUM ON SECURE AND PRIVATE EXECUTION ENVIRONMENT DESIGN (SEED 2021)}, author={Ul Mustafa, Naveed and Xu, Yuanchao and Shen, Xipeng and Solihin, Yan}, year={2021}, pages={83--88} } @article{ye_xu_shen_liao_jin_solihin_2021b, title={Supporting Legacy Libraries on Non-Volatile Memory: A User-Transparent Approach}, ISSN={1063-6897}, DOI={10.1109/ISCA52012.2021.00042}, abstractNote={As mainstream computing is poised to embrace the advent of byte-addressable non-volatile memory (NVM), an important roadblock has remained largely unnoticed, support of legacy libraries on NVM. Libraries underpin modern software everywhere. As current NVM programming interfaces all designate special types and constructs for NVM objects and references, legacy libraries, being incompatible with these data types, will face major obstacles for working with future applications written for NVM. This paper introduces a simple approach to mitigating the issue. The novel approach centers around user-transparent persistent reference, a new concept that allows programmers to reference a persistent object in the same way as reference a normal (volatile) object. 
The paper presents the implementation of the concept, carefully examines its soundness, and describes compiler and simple architecture support for keeping performance overheads very low.}, journal={2021 ACM/IEEE 48TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2021)}, author={Ye, Chencheng and Xu, Yuanchao and Shen, Xipeng and Liao, Xiaofei and Jin, Hai and Solihin, Yan}, year={2021}, pages={443--455} } @article{zhang_xu_shen_dillig_2021, title={UDF to SQL Translation through Compositional Lazy Inductive Synthesis}, volume={5}, ISSN={2475-1421}, url={https://doi.org/10.1145/3485489}, DOI={10.1145/3485489}, abstractNote={ Many data processing systems allow SQL queries that call user-defined functions (UDFs) written in conventional programming languages. While such SQL extensions provide convenience and flexibility to users, queries involving UDFs are not as efficient as their pure SQL counterparts that invoke SQL’s highly-optimized built-in functions. Motivated by this problem, we propose a new technique for translating SQL queries with UDFs to pure SQL expressions. Unlike prior work in this space, our method is not based on syntactic rewrite rules and can handle a much more general class of UDFs. At a high-level, our method is based on counterexample-guided inductive synthesis (CEGIS) but employs a novel compositional strategy that decomposes the synthesis task into simpler sub-problems. However, because there is no universal decomposition strategy that works for all UDFs, we propose a novel lazy inductive synthesis approach that generates a sequence of decompositions that correspond to increasingly harder inductive synthesis problems. Because most realistic UDF-to-SQL translation tasks are amenable to a fine-grained decomposition strategy, our lazy inductive synthesis method scales significantly better than traditional CEGIS. 
We have implemented our proposed technique in a tool called CLIS for optimizing Spark SQL programs containing Scala UDFs. To evaluate CLIS, we manually study 100 randomly selected UDFs and find that 63 of them can be expressed in pure SQL. Our evaluation on these 63 UDFs shows that CLIS can automatically synthesize equivalent SQL expressions in 92% of the cases and that it can solve 2.4× more benchmarks compared to a baseline that does not use our compositional approach. We also show that CLIS yields an average speed-up of 3.5× for individual UDFs and 1.3× to 3.1× in terms of end-to-end application performance.}, number={OOPSLA}, journal={PROCEEDINGS OF THE ACM ON PROGRAMMING LANGUAGES-PACMPL}, publisher={Association for Computing Machinery (ACM)}, author={Zhang, Guoqiang and Xu, Yuanchao and Shen, Xipeng and Dillig, Isil}, year={2021}, month=oct } @article{xu_ye_solihin_shen_2020, title={Hardware-Based Domain Virtualization for Intra-Process Isolation of Persistent Memory Objects}, ISSN={0884-7495}, DOI={10.1109/ISCA45697.2020.00062}, abstractNote={Persistent memory has appealing properties in serving as main memory. While file access is protected by system calls, an attached persistent memory object (PMO) is one load/store away from accidental (or malicious) reads or writes, which may arise from use of just one buggy library. The recent progress in intra-process isolation could potentially protect PMO by enabling a process to partition sensitive data and code into isolated components. However, the existing intra-process isolations (e.g., Intel MPK) support isolation of only up to 16 domains, forming a major barrier for PMO protections. Although there is some recent effort trying to virtualize MPK to circumvent the limit, it suffers large overhead. 
This paper presents two novel architecture supports, which provide 11 - 52 × higher efficiency while offering the first known domain-based protection for PMOs.}, journal={2020 ACM/IEEE 47TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2020)}, author={Xu, Yuanchao and Ye, Chencheng and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={680--692} } @article{xu_solihin_shen_2020, title={MERR: Improving Security of Persistent Memory Objects via Efficient Memory Exposure Reduction and Randomization}, DOI={10.1145/3373376.3378492}, abstractNote={This paper proposes a new defensive technique for memory, especially useful for long-living objects on Non-Volatile Memory (NVM), or called Persistent Memory objects (PMOs). The method takes a distinctive perspective, trying to reduce memory exposure time by largely shortening the overhead in attaching and detaching PMOs into the memory space. It does it through a novel idea, embedding page table subtrees inside PMOs. The paper discusses the complexities the technique brings, to permission controls and hardware implementations, and provides solutions. Experimental results show that the new technique reduces memory exposure time by 60% with a 5% time overhead (70% with 10.9% overhead). It allows much more frequent address randomizations (shortening the period from seconds to less than 41.4us), offering significant potential for enhancing memory security.}, journal={TWENTY-FIFTH INTERNATIONAL CONFERENCE ON ARCHITECTURAL SUPPORT FOR PROGRAMMING LANGUAGES AND OPERATING SYSTEMS (ASPLOS XXV)}, author={Xu, Yuanchao and Solihin, Yan and Shen, Xipeng}, year={2020}, pages={987--1000} }