@article{das_mueller_rountree_2021,
  title={Systemic Assessment of Node Failures in HPC Production Platforms},
  ISSN={1530-2075},
  DOI={10.1109/IPDPS49936.2021.00035},
  abstractNote={Production HPC clusters endure failures that reduce computational capability and resource availability. Despite the presence of various failure prediction schemes for large-scale computing systems, a comprehensive understanding of how nodes fail, considering the various components and layers of the system, is required for sustained resilience. This work performs a holistic diagnosis of node failures using a measurement-driven approach on contemporary system logs that can help vendors and system administrators support exascale resilience. Our work shows that external environmental influence is not strongly correlated with node failures in terms of the root cause. Though hardware and software faults trigger failures, the underlying root cause often lies in application malfunctions that cause the system to fail. Furthermore, lead time enhancements are feasible for nodes showing fail-slow characteristics. This study uncovers such helpful empirical observations, which could facilitate better failure handling in production systems.},
  journal={2021 IEEE 35th International Parallel and Distributed Processing Symposium (IPDPS)},
  author={Das, Anwesha and Mueller, Frank and Rountree, Barry},
  year={2021},
  pages={267–276}
}

@article{das_mueller_rountree_2020,
  title={Aarohi: Making Real-Time Node Failure Prediction Feasible},
  ISSN={1530-2075},
  DOI={10.1109/IPDPS47924.2020.00115},
  abstractNote={Large-scale production systems are well known to encounter node failures, which affect compute capacity and energy. In both HPC systems and enterprise data centers, combating failures is becoming challenging with increasing hardware and software complexity. Several log mining solutions have been investigated in the context of anomaly detection in such systems. However, for subsequent proactive failure mitigation, the existing log mining solutions are not sufficiently fast for real-time anomaly detection. Machine learning (ML)-based training can produce high accuracy, but the inference scheme needs to be enhanced with rapid parsers to assess anomalies in real time. This work tackles online anomaly prediction in computing systems by exploiting context-free grammar-based rapid event analysis. We present our framework, Aarohi, which describes an effective way to predict failures online. Aarohi is designed to be generic and scalable, making it suitable as a real-time predictor. Aarohi obtains lead times of more than 3 minutes to node failures, with an average prediction time of 0.31 msecs for a chain length of 18. The overall improvement obtained w.r.t. the existing state of the art is over a factor of 27.4×. Our compiler-based approach provides new research directions for lead time optimization, with a significant prediction speedup required for the deployment of proactive fault-tolerant solutions in practice.},
  journal={2020 IEEE 34th International Parallel and Distributed Processing Symposium (IPDPS)},
  author={Das, Anwesha and Mueller, Frank and Rountree, Barry},
  year={2020},
  pages={1092–1101}
}

@article{das_mueller_siegel_vishnu_2018,
  title={Desh: Deep Learning for System Health Prediction of Lead Times to Failure in HPC},
  DOI={10.1145/3208040.3208051},
  abstractNote={Today's large-scale supercomputers encounter faults on a daily basis.
Exascale systems are likely to experience even higher fault rates due to increased component count and density. Triggering resilience-mitigating techniques remains a challenge due to the absence of well-defined failure indicators. System logs consist of unstructured text that obscures the essential system health information contained within. In this context, efficient failure prediction via log mining can enable proactive recovery mechanisms to increase reliability. This work aims to predict node failures that occur in supercomputing systems via long short-term memory (LSTM) networks, a class of recurrent neural networks (RNNs). Our framework, Desh (Deep Learning for System Health), diagnoses and predicts failures with short lead times. Desh identifies failure indicators with enhanced training and classification for generic applicability to logs from operating systems and software components without the need to modify any of them. Desh uses a novel three-phase deep learning approach to (1) train to recognize chains of log events leading to a failure, (2) re-train chain recognition of events augmented with expected lead times to failure, and (3) predict lead times during testing/inference deployment to predict which specific node fails in how many minutes. Desh obtains average lead times of up to 3 minutes with no less than 85% recall and 83% accuracy for taking proactive actions on the failing nodes, such as migrating computation to healthy nodes.},
  journal={HPDC '18: Proceedings of the 27th International Symposium on High-Performance Parallel and Distributed Computing},
  author={Das, Anwesha and Mueller, Frank and Siegel, Charles and Vishnu, Abhinav},
  year={2018},
  pages={40–51}
}

@article{das_iyengar_mueller_2018,
  title={KeyValueServe†: Design and Performance Analysis of a Multi-Tenant Data Grid as a Cloud Service},
  volume={30},
  number={14},
  ISSN={1532-0634},
  DOI={10.1002/cpe.4424},
  journal={Concurrency and Computation: Practice and Experience},
  author={Das, Anwesha and Iyengar, Arun and Mueller, Frank},
  year={2018},
  month={Jul}
}

@inproceedings{das_mueller_gu_iyengar_2016,
  title={Performance Analysis of a Multi-Tenant In-Memory Data Grid},
  DOI={10.1109/cloud.2016.0144},
  abstractNote={Distributed key-value stores have become indispensable for large-scale, low-latency applications. Many cloud services have deployed in-memory data grids for their enterprise infrastructures and support multi-tenancy services. However, it is still difficult to provide consistent performance to all tenants for fluctuating workloads that need to scale out. Many popular key-value stores suffer from performance problems at scale and under differing tenant requirements. To this end, we present our study of Hazelcast, a popular open-source data grid, and provide insights into contention and performance bottlenecks. Through experimental analysis, this paper uncovers scenarios of performance degradation followed by optimized performance via end-point multiplexing. Our study suggests that processing an increasing number of client requests while spawning fewer threads helps improve performance.},
  booktitle={Proceedings of the 2016 IEEE 9th International Conference on Cloud Computing (CLOUD)},
  author={Das, Anwesha and Mueller, Frank and Gu, Xiaohui and Iyengar, Arun},
  year={2016},
  pages={956–959}
}