@article{elnawawy_tuck_byrd_2023,
  title={PreFlush: Lightweight Hardware Prediction Mechanism for Cache Line Flush and Writeback},
  ISSN={1089-795X},
  DOI={10.1109/PACT58117.2023.00015},
  abstractNote={Non-Volatile Main Memory (NVMM) technologies make it possible for applications to permanently store data in memory. To do so, they need to make sure that updates to persistent data comply with the crash consistency model, which often involves explicitly flushing a dirty cache line after a store and then waiting for the flush operation to complete using a store fence. While cache line flush and write back instructions can complete in the background, fence instructions expose the latency of flushing to the critical path of the program's execution, incurring significant overheads. If flush operations are started earlier, the penalty of fences can be significantly reduced. We propose PreFlush, a lightweight and transparent hardware mechanism that predicts when a cache line flush or write back is needed and speculatively performs the operation early. Since we speculatively perform the flush, we add hardware to handle flush misspeculation to ensure correct execution of the code without the need for any complex recovery mechanisms. Our PreFlush design is transparent to the programmer (i.e., it requires no modification to existing NVMM-enabled code). Our results show that PreFlush can improve performance by up to 25% (15.7% on average) for the WHISPER NVM benchmark suite and loop-based matrix microbenchmarks.},
  journal={2023 32nd International Conference on Parallel Architectures and Compilation Techniques (PACT)},
  author={Elnawawy, Hussein and Tuck, James and Byrd, Gregory T.},
  editor={Tuck, James and Byrd, Gregory},
  year={2023},
  pages={74–85}
}

@article{alshboul_elnawawy_elkhouly_kimura_tuck_solihin_2019,
  title={Efficient Checkpointing with Recompute Scheme for Non-volatile Main Memory},
  volume={16},
  ISSN={1544-3973},
  DOI={10.1145/3323091},
  abstractNote={Future main memory will likely include Non-Volatile Memory. Non-Volatile Main Memory (NVMM) provides an opportunity to rethink checkpointing strategies for providing failure safety to applications. While there are many checkpointing and logging schemes in the literature, their use must be revisited as they incur high execution time overheads as well as a large number of additional writes to NVMM, which may significantly impact write endurance. In this article, we propose a novel recompute-based failure safety approach and demonstrate its applicability to loop-based code. Rather than keeping a fully consistent logging state, we only log enough state to enable recomputation. Upon a failure, our approach recovers to a consistent state by determining which parts of the computation were not completed and recomputing them. Effectively, our approach removes the need to keep checkpoints or logs, thus reducing execution time overheads and improving NVMM write endurance at the expense of more complex recovery. We compare our new approach against logging and checkpointing on five scientific workloads, including tiled matrix multiplication, on a computer system model that was built on gem5 and supports Intel PMEM instruction extensions. For tiled matrix multiplication, our recompute approach incurs an execution time overhead of only 5%, in contrast to 8% overhead with logging and 207% overhead with checkpointing. Furthermore, recompute only adds 7% additional NVMM writes, compared to 111% with logging and 330% with checkpointing. We also conduct experiments on real hardware, allowing us to run our workloads to completion while varying the number of threads used for computation. These experiments substantiate our simulation-based observations and provide a sensitivity study and performance comparison between the Recompute Scheme and Naive Checkpointing.},
  number={2},
  journal={ACM Transactions on Architecture and Code Optimization},
  author={Alshboul, Mohammad and Elnawawy, Hussein and Elkhouly, Reem and Kimura, Keiji and Tuck, James and Solihin, Yan},
  year={2019},
  month={May}
}