% Conference paper: FChain, published in the ICDCS 2013 proceedings.
@inproceedings{nguyen_shen_tan_gu_2013,
  author    = {Nguyen, Hiep and Shen, Zhiming and Tan, Yongmin and Gu, Xiaohui},
  title     = {{FChain}: Toward Black-box Online Fault Localization for Cloud Systems},
  booktitle = {2013 {IEEE} 33rd International Conference on Distributed Computing Systems ({ICDCS})},
  year      = {2013},
  pages     = {21--30},
  issn      = {1063-6927},
  doi       = {10.1109/icdcs.2013.26},
  abstract  = {Distributed applications running inside cloud systems are prone to performance anomalies due to various reasons such as resource contentions, software bugs, and hardware failures. One big challenge for diagnosing an abnormal distributed application is to pinpoint the faulty components. In this paper, we present a black-box online fault localization system called FChain that can pinpoint faulty components immediately after a performance anomaly is detected. FChain first discovers the onset time of abnormal behaviors at different components by distinguishing the abnormal change point from many change points caused by normal workload fluctuations. Faulty components are then pinpointed based on the abnormal change propagation patterns and inter-component dependency relationships. FChain performs runtime validation to further filter out false alarms. We have implemented FChain on top of the Xen platform and tested it using several benchmark applications (RUBiS, Hadoop, and IBM System S). Our experimental results show that FChain can quickly pinpoint the faulty components with high accuracy within a few seconds. FChain can achieve up to 90\% higher precision and 20\% higher recall than existing schemes. FChain is non-intrusive and light-weight, which imposes less than 1\% overhead to the cloud system.},
}

% Journal article: Jump-start cloud deployment framework.
% NOTE(review): given names are abbreviated in the source record; expand them
% to full names if they can be confirmed against the published article.
@article{wu_shen_wu_lin_2012,
  author  = {Wu, X. X. and Shen, Z. M. and Wu, R. and Lin, Y. F.},
  title   = {Jump-start cloud: Efficient deployment framework for large-scale cloud applications},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {24},
  number  = {17},
  year    = {2012},
  pages   = {2120--2137},
}

% Conference paper: PREPARE, published in the ICDCS 2012 proceedings.
@inproceedings{tan_nguyen_shen_gu_venkatramani_rajan_2012,
  author    = {Tan, Yongmin and Nguyen, Hiep and Shen, Zhiming and Gu, Xiaohui and Venkatramani, Chitra and Rajan, Deepak},
  title     = {{PREPARE}: Predictive Performance Anomaly Prevention for Virtualized Cloud Systems},
  booktitle = {2012 {IEEE} 32nd International Conference on Distributed Computing Systems ({ICDCS})},
  year      = {2012},
  pages     = {285--294},
  issn      = {1063-6927},
  doi       = {10.1109/icdcs.2012.65},
  abstract  = {Virtualized cloud systems are prone to performance anomalies due to various reasons such as resource contentions, software bugs, and hardware failures. In this paper, we present a novel Predictive Performance Anomaly Prevention (PREPARE) system that provides automatic performance anomaly prevention for virtualized cloud computing infrastructures. PREPARE integrates online anomaly prediction, learning-based cause inference, and predictive prevention actuation to minimize the performance anomaly penalty without human intervention. We have implemented PREPARE on top of the Xen platform and tested it on the NCSU's Virtual Computing Lab using a commercial data stream processing system (IBM System S) and an online auction benchmark (RUBiS). The experimental results show that PREPARE can effectively prevent performance anomalies while imposing low overhead to the cloud infrastructure.},
}