@inproceedings{patil_mueller_ionkov_lee_lang_2020, place={New York}, title={Symbiotic HW Cache and SW DTLB Prefetching for DRAM/NVM Hybrid Memory}, ISBN={9781728192383}, url={http://dx.doi.org/10.1109/MASCOTS50786.2020.9285963}, DOI={10.1109/MASCOTS50786.2020.9285963}, abstractNote={The introduction of NVDIMM memory devices has encouraged the use of DRAM/NVM-based hybrid memory systems to increase the memory-per-core ratio in compute nodes and obtain possible energy and cost benefits. However, Non-Volatile Memory (NVM) is slower than DRAM in terms of read/write latency. This difference in performance will adversely affect memory-bound applications. Traditionally, data prefetching at the hardware level has been used to increase the number of cache hits and thereby mitigate performance degradation. However, software (SW) prefetching has not been used effectively to reduce the effects of high memory access latencies. Also, the current cache hierarchy and hardware (HW) prefetching are not optimized for a hybrid memory system. We hypothesize that HW and SW prefetching can complement each other in placing data in caches and the Data Translation Look-aside Buffer (DTLB) prior to their references, and that doing so adaptively accounts for the highly varying access latencies of a DRAM/NVM hybrid memory system. This work contributes an adaptive SW prefetch method based on the characterization of read/write/unroll prefetch distances for NVM and DRAM. Prefetch performance is characterized via custom benchmarks based on STREAM2 specifications in a multicore MPI runtime environment and compared to the performance of the standard SW prefetch pass in GCC. Furthermore, the effects of HW prefetching on kernels executing on a hybrid memory system are evaluated. Experimental results indicate that SW prefetching targeted at populating the DTLB results in up to a 26% performance improvement when used symbiotically with HW prefetching, as opposed to HW prefetching alone. Based on our findings, changes to GCC's prefetch-loop-arrays compiler pass are proposed to take advantage of DTLB prefetching in a hybrid memory system for kernels that are frequently used in HPC applications.}, booktitle={2020 28th International Symposium on Modeling, Analysis, and Simulation of Computer and Telecommunication Systems (MASCOTS)}, publisher={IEEE}, author={Patil, Onkar and Mueller, Frank and Ionkov, Latchesar and Lee, Jason and Lang, Michael}, year={2020}, month={Nov}, pages={1–8} } @article{rezaei_khetawat_patil_mueller_hargrove_roman_2019, title={End-to-End Resilience for HPC Applications}, volume={11501}, ISBN={978-3-030-20655-0}, ISSN={1611-3349}, DOI={10.1007/978-3-030-20656-7_14}, abstractNote={A plethora of resilience techniques have been investigated to protect application kernels. If, however, such techniques are combined and they interact across kernels, new vulnerability windows are created. This work contributes the idea of end-to-end resilience by protecting windows of vulnerability between kernels guarded by different resilience techniques. It introduces the live vulnerability factor (LVF), a new metric that quantifies any lack of end-to-end protection for a given data structure. The work further promotes end-to-end application protection across kernels via a pragma-based specification for diverse resilience schemes with minimal programming effort. This lifts the data protection burden from application programmers, allowing them to focus solely on algorithms and performance while resilience is specified and subsequently embedded into the code through the compiler/library and supported by the runtime system. In experiments with case studies and benchmarks, end-to-end resilience has an overhead over kernel-specific resilience of less than 3% on average and increases protection against bit flips by a factor of three to four.}, journal={High Performance Computing, ISC High Performance 2019}, author={Rezaei, Arash and Khetawat, Harsh and Patil, Onkar and Mueller, Frank and Hargrove, Paul and Roman, Eric}, year={2019}, pages={271–290} }
@inproceedings{patil_ionkov_lee_mueller_lang_2019, title={Performance characterization of a DRAM-NVM hybrid memory architecture for HPC applications using Intel Optane DC persistent memory modules}, ISBN={9781450372060}, url={http://dx.doi.org/10.1145/3357526.3357541}, DOI={10.1145/3357526.3357541}, abstractNote={Non-volatile, byte-addressable memory (NVM) has been introduced by Intel in the form of NVDIMMs named Intel® Optane™ DC PMM. This memory module has the ability to persist the data stored in it without the need for power. This expands the memory hierarchy into a hybrid memory system due to the differences in access latency and memory bandwidth from DRAM, which has been the predominant byte-addressable main memory technology. The Optane DC memory modules have up to 8x the capacity of DDR4 DRAM modules, which can expand the byte-addressable space up to 6 TB per node. Many applications can now scale up their problem size given such a memory system. We evaluate the capabilities of this DRAM-NVM hybrid memory system and its impact on High Performance Computing (HPC) applications. We characterize the Optane DC in comparison to DDR4 DRAM with a STREAM-like custom benchmark and measure the performance of HPC mini-apps like VPIC, SNAP, LULESH and AMG under different configurations of Optane DC PMMs. We find that Optane-only executions are slower in terms of execution time than DRAM-only and Memory-mode executions by a minimum of 2 to 16% for VPIC and a maximum of 6x for LULESH.}, booktitle={Proceedings of the International Symposium on Memory Systems - MEMSYS '19}, publisher={ACM Press}, author={Patil, Onkar and Ionkov, Latchesar and Lee, Jason and Mueller, Frank and Lang, Michael}, year={2019} } @inproceedings{yagna_patil_mueller_2016, title={Efficient and predictable group communication for manycore NoCs}, volume={9697}, booktitle={High Performance Computing}, author={Yagna, K. and Patil, O. and Mueller, F.}, year={2016}, pages={383–403} }