@article{dean_nguyen_wang_gu_sailer_kochut_2016,
  title={PerfCompass: Online Performance Anomaly Fault Localization and Inference in Infrastructure-as-a-Service Clouds},
  volume={27},
  ISSN={1558-2183},
  DOI={10.1109/tpds.2015.2444392},
  abstractNote={Infrastructure-as-a-service clouds are becoming widely adopted. However, resource sharing and multi-tenancy have made performance anomalies a top concern for users. Timely debugging of those anomalies is paramount for minimizing the performance penalty for users. Unfortunately, this debugging often takes a long time due to the inherent complexity and sharing nature of cloud infrastructures. When an application experiences a performance anomaly, it is important to distinguish between faults with a global impact and faults with a local impact, because the diagnosis and recovery steps for the two kinds of faults are quite different. In this paper, we present PerfCompass, an online performance anomaly fault debugging tool that can quantify whether a production-run performance anomaly has a global impact or a local impact. PerfCompass can use this information to suggest the root cause as either an external fault (e.g., environment-based) or an internal fault (e.g., software bugs). Furthermore, PerfCompass can identify the top affected system calls to provide useful diagnostic hints for detailed performance debugging. PerfCompass does not require source code or runtime application instrumentation, which makes it practical for production systems. We have tested PerfCompass by running five common open source systems (Apache, MySQL, Tomcat, Hadoop, and Cassandra) inside a virtualized cloud testbed. Our experiments use a range of common infrastructure sharing issues and real software bugs. The results show that PerfCompass accurately classifies 23 out of the 24 tested cases without calibration and achieves 100 percent accuracy with calibration. PerfCompass provides useful diagnosis hints within several minutes and imposes negligible runtime overhead on the production system during normal execution.},
  number={6},
  journal={IEEE Transactions on Parallel and Distributed Systems},
  author={Dean, Daniel J. and Nguyen, Hiep and Wang, Peipei and Gu, Xiaohui and Sailer, Anca and Kochut, Andrzej},
  year={2016},
  month={Jun},
  pages={1742–1755}
}

@inproceedings{wang_nguyen_gu_lu_2016,
  title={RDE: Replay DEbugging for Diagnosing Production Site Failures},
  DOI={10.1109/srds.2016.050},
  abstractNote={Online service failures in production computing environments are notoriously difficult to debug. One of the key challenges is to allow the developer to replay the failure execution within an interactive debugging tool such as GDB. Previous work has proposed in-situ approaches to inferring the production-run failure path within the production environment. However, those tools may sometimes suggest failure execution paths that are infeasible to reach by any program input. Moreover, the production site often does not record or provide failure-triggering inputs due to user privacy concerns. In this paper, we present RDE, a Replay DEbug system that can replay a production-site failure at the development site within an interactive debugging environment without requiring user inputs. RDE takes an inferred production failure path as input and performs execution synthesis using a new guided symbolic execution technique. RDE can tolerate imprecise or inaccurate failure path information by navigating the symbolic execution along a set of selected paths.
RDE synthesizes an input from the selected symbolic execution path, which can be fed to a debugging tool to replay the failure. We have implemented an initial prototype of RDE and tested it with a set of coreutils bugs. The results show that RDE can successfully replay all the tested bugs within GDB.},
  booktitle={Proceedings of the 2016 IEEE 35th Symposium on Reliable Distributed Systems (SRDS)},
  author={Wang, Peipei and Nguyen, Hiep and Gu, Xiaohui and Lu, Shan},
  year={2016},
  pages={327–336}
}

@inproceedings{tan_nguyen_shen_gu_venkatramani_rajan_2012,
  title={PREPARE: Predictive Performance Anomaly Prevention for Virtualized Cloud Systems},
  ISSN={1063-6927},
  DOI={10.1109/icdcs.2012.65},
  abstractNote={Virtualized cloud systems are prone to performance anomalies for various reasons, such as resource contention, software bugs, and hardware failures. In this paper, we present a novel Predictive Performance Anomaly Prevention (PREPARE) system that provides automatic performance anomaly prevention for virtualized cloud computing infrastructures. PREPARE integrates online anomaly prediction, learning-based cause inference, and predictive prevention actuation to minimize the performance anomaly penalty without human intervention. We have implemented PREPARE on top of the Xen platform and tested it on NCSU's Virtual Computing Lab using a commercial data stream processing system (IBM System S) and an online auction benchmark (RUBiS). The experimental results show that PREPARE can effectively prevent performance anomalies while imposing low overhead on the cloud infrastructure.},
  booktitle={Proceedings of the 2012 IEEE 32nd International Conference on Distributed Computing Systems (ICDCS)},
  author={Tan, Yongmin and Nguyen, Hiep and Shen, Zhiming and Gu, Xiaohui and Venkatramani, Chitra and Rajan, Deepak},
  year={2012},
  pages={285–294}
}