% Journal article: Scott et al., "A Tunable Holistic Resiliency Approach for
% High-Performance Computing Systems", ACM SIGPLAN Notices 44(4), April 2009,
% pages 305--306 (a poster summary, per the abstract).
%
% Key: identical to the exported key except that the space in "et al." is
% replaced by an underscore, because whitespace inside a citation key stops
% BibTeX from parsing the entry.
% Authors: the exported list is truncated after ten names; "and others" is the
% BibTeX token that styles render as "et al.".
% NOTE(review): the surname "Vallee" may be missing an accent
% (Vall{\'e}e) -- confirm against the publisher record before changing it.
@article{scott_engelmann_vallee_naughton_tikotekar_ostrouchov_leangsuksun_naksinehaboon_nassar_paun_et_al._2009,
  author   = {Scott, Stephen L. and Engelmann, Christian and Vallee, Geoffroy R. and Naughton, Thomas and Tikotekar, Anand and Ostrouchov, George and Leangsuksun, Chokchai and Naksinehaboon, Nichamon and Nassar, Raja and Paun, Mihaela and others},
  title    = {A Tunable Holistic Resiliency Approach for High-Performance Computing Systems},
  journal  = {{ACM} {SIGPLAN} Notices},
  volume   = {44},
  number   = {4},
  pages    = {305--306},
  year     = {2009},
  month    = apr,
  issn     = {1558-1160},
  doi      = {10.1145/1594835.1504227},
  abstract = {In order to address anticipated high failure rates, resiliency characteristics have become an urgent priority for next-generation extreme-scale high-performance computing (HPC) systems. This poster describes our past and ongoing efforts in novel fault resilience technologies for HPC. Presented work includes proactive fault resilience techniques, system and application reliability models and analyses, failure prediction, transparent process- and virtual-machine-level migration, and trade-off models for combining preemptive migration with checkpoint/restart. This poster summarizes our work and puts all individual technologies into context with a proposed holistic fault resilience framework.},
}