@comment{ Review pass: conference papers retyped @article -> @inproceedings
  (journal -> booktitle), JSON-array residue stripped from ISSN values,
  en-dash page ranges changed to --, acronyms brace-protected in titles,
  venue names normalized from ALL CAPS, Unicode and lost superscripts in
  abstracts rebuilt as LaTeX math, and bare % escaped so abstracts survive
  LaTeX rendering. Citation keys are unchanged. }

@inproceedings{fiala_mueller_ferreira_2016,
  title     = {{FlipSphere}: A Software-Based {DRAM} Error Detection and
               Correction Library for {HPC}},
  author    = {Fiala, David and Mueller, Frank and Ferreira, Kurt B.},
  booktitle = {2016 IEEE/ACM 20th International Symposium on Distributed
               Simulation and Real Time Applications ({DS-RT})},
  year      = {2016},
  pages     = {19--28},
  issn      = {1550-6525},
  doi       = {10.1109/ds-rt.2016.27},
  abstract  = {Proposed exascale systems will present considerable
               challenges. In particular, DRAM soft-errors, or bit-flips,
               are expected to greatly increase due to higher memory
               densities and near-threshold voltages. To address this
               challenge, we introduce FlipSphere, a tunable, transparent
               silent data corruption detection and correction library for
               HPC applications that is first in its class to use hardware
               accelerators, such as the Intel Xeon Phi MIC, to increase
               application resilience. FlipSphere provides comprehensive
               silent data corruption protection for application memory by
               implementing on-demand page integrity verification coupled
               with a software-based error correcting code that allows for
               automatic error recovery. Using this framework, we
               demonstrate the trade-offs of dedicating hardware resources
               for resilience, showing up to 90\% of memory may be
               protected with a 40\% slowdown.},
}

@inproceedings{elliott_kharbas_fiala_mueller_ferreira_engelmann_2012,
  title     = {Combining Partial Redundancy and Checkpointing for {HPC}},
  author    = {Elliott, James and Kharbas, Kishor and Fiala, David and
               Mueller, Frank and Ferreira, Kurt and Engelmann, Christian},
  booktitle = {2012 IEEE 32nd International Conference on Distributed
               Computing Systems ({ICDCS})},
  year      = {2012},
  pages     = {615--626},
  issn      = {1063-6927},
  doi       = {10.1109/icdcs.2012.56},
  abstract  = {Today's largest High Performance Computing (HPC) systems
               exceed one Petaflops ($10^{15}$ floating point operations
               per second) and exascale systems are projected within seven
               years. But reliability is becoming one of the major
               challenges faced by exascale computing. With billion-core
               parallelism, the mean time to failure is projected to be in
               the range of minutes or hours instead of days. Failures are
               becoming the norm rather than the exception during execution
               of HPC applications. Current fault tolerance techniques in
               HPC focus on reactive ways to mitigate faults, namely via
               checkpoint and restart (C/R). Apart from storage overheads,
               C/R-based fault recovery comes at an additional cost in
               terms of application performance because normal execution is
               disrupted when checkpoints are taken. Studies have shown
               that applications running at a large scale spend more than
               50\% of their total time saving checkpoints, restarting and
               redoing lost work. Redundancy is another fault tolerance
               technique, which employs redundant processes performing the
               same task. If a process fails, a replica of it can take over
               its execution. Thus, redundant copies can decrease the
               overall failure rate. The downside of redundancy is that
               extra resources are required and there is an additional
               overhead on communication and synchronization. This work
               contributes a model and analyzes the benefit of C/R in
               coordination with redundancy at different degrees to
               minimize the total wallclock time and resources utilization
               of HPC applications. We further conduct experiments with an
               implementation of redundancy within the MPI layer on a
               cluster. Our experimental results confirm the benefit of
               dual and triple redundancy --- but not for partial
               redundancy --- and show a close fit to the model. At
               $\approx 80{,}000$ processes, dual redundancy requires twice
               the number of processing resources for an application but
               allows two jobs of 128 hours wallclock time to finish within
               the time of just one job without redundancy. For narrow
               ranges of processor counts, partial redundancy results in
               the lowest time. Once the count exceeds $\approx 770{,}000$,
               triple redundancy has the lowest overall cost. Thus,
               redundancy allows one to trade-off additional resource
               requirements against wallclock time, which provides a
               tuning knob for users to adapt to resource availabilities.},
}

@inproceedings{fiala_mueller_engelmann_riesen_ferreira_brightwell_2012,
  title     = {Detection and Correction of Silent Data Corruption for
               Large-Scale High-Performance Computing},
  author    = {Fiala, David and Mueller, Frank and Engelmann, Christian and
               Riesen, R. and Ferreira, Kurt and Brightwell, R.},
  booktitle = {{SC} '12: Proceedings of the International Conference on
               High Performance Computing, Networking, Storage and
               Analysis},
  year      = {2012},
  doi       = {10.1109/sc.2012.49},
  internal-note = {NOTE(review): original booktitle was truncated to
               "International conference for high performance computing
               networking"; expanded to the standard SC series title ---
               confirm against the DOI landing page. Riesen and Brightwell
               given names not expandable from this file; pages missing.},
  abstract  = {Faults have become the norm rather than the exception for
               high-end computing clusters. Exacerbating this situation,
               some of these faults remain undetected, manifesting
               themselves as silent errors that allow applications to
               compute incorrect results. This paper studies the potential
               for redundancy to detect and correct soft errors in MPI
               message-passing applications while investigating the
               challenges inherent to detecting soft errors within MPI
               applications by providing transparent MPI redundancy. By
               assuming a model wherein corruption in application data
               manifests itself by producing differing MPI messages between
               replicas, we study the best suited protocols for detecting
               and correcting corrupted MPI messages. Using our fault
               injector, we observe that even a single error can have
               profound effects on applications by causing a cascading
               pattern of corruption which in most cases spreads to all
               other processes. Results indicate that our consistency
               protocols can successfully protect applications experiencing
               even high rates of silent data corruption.},
}