@article{dai_dean_wang_gu_lu_2019, title={Hytrace: A Hybrid Approach to Performance Bug Diagnosis in Production Cloud Infrastructures}, volume={30}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2018.2858800}, abstractNote={Server applications running inside production cloud infrastructures are prone to various performance problems (e.g., software hang, performance slowdown). When those problems occur, developers often have little clue to diagnose those problems. In this paper, we present Hytrace, a novel hybrid approach to diagnosing performance problems in production cloud infrastructures. Hytrace combines rule-based static analysis and runtime inference techniques to achieve higher bug localization accuracy than pure-static and pure-dynamic approaches for performance bugs. Hytrace does not require source code and can be applied to both compiled and interpreted programs such as C/C++ and Java. We conduct experiments using real performance bugs from seven commonly used server applications in production cloud infrastructures. The results show that our approach can significantly improve the performance bug diagnosis accuracy compared to existing diagnosis techniques.}, number={1}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Dai, Ting and Dean, Daniel and Wang, Peipei and Gu, Xiaohui and Lu, Shan}, year={2019}, month={Jan}, pages={107–118} } @article{dai_he_gu_lu_wang_2018, title={DScope: Detecting Real-World Data Corruption Hang Bugs in Cloud Server Systems}, DOI={10.1145/3267809.3267844}, abstractNote={Cloud server systems such as Hadoop and Cassandra have enabled many real-world data-intensive applications running inside computing clouds. However, those systems present many data-corruption and performance problems which are notoriously difficult to debug due to the lack of diagnosis information. In this paper, we present DScope, a tool that statically detects data-corruption related software hang bugs in cloud server systems. DScope statically analyzes I/O operations and loops in a software package, and identifies loops whose exit conditions can be affected by I/O operations through returned data, returned error code, or I/O exception handling. After identifying those loops which are prone to hang problems under data corruption, DScope conducts loop bound and loop stride analysis to prune out false positives. We have implemented DScope and evaluated it using 9 common cloud server systems. Our results show that DScope can detect 42 real software hang bugs including 29 newly discovered software hang bugs. In contrast, existing bug detection tools miss detecting most of those bugs.}, journal={PROCEEDINGS OF THE 2018 ACM SYMPOSIUM ON CLOUD COMPUTING (SOCC '18)}, author={Dai, Ting and He, Jingzhu and Gu, Xiaohui and Lu, Shan and Wang, Peipei}, year={2018}, pages={313–325} } @article{wang_stolee_2018, title={How Well Are Regular Expressions Tested in the Wild?}, DOI={10.1145/3236024.3236072}, abstractNote={Developers report testing their regular expressions less than the rest of their code. In this work, we explore how thoroughly tested regular expressions are by examining open source projects. Using standard metrics of coverage, such as line and branch coverage, gives an incomplete picture of the test coverage of regular expressions. We adopt graph-based coverage metrics for the DFA representation of regular expressions, providing fine-grained test coverage metrics. 
Using over 15,000 tested regular expressions in 1,225 Java projects on GitHub, we measure node, edge, and edge-pair coverage. Our results show that only 17% of the regular expressions in the repositories are tested at all. For those that are tested, the median number of test inputs is two. For nearly 42% of the tested regular expressions, only one test input is used. Average node and edge coverage levels on the DFAs for tested regular expressions are 59% and 29%, respectively. Due to the lack of testing of regular expressions, we explore whether a string generation tool for regular expressions, Rex, achieves high coverage levels. With some exceptions, we found that tools such as Rex can be used to write test inputs with similar coverage to the developer tests.}, journal={ESEC/FSE'18: PROCEEDINGS OF THE 2018 26TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Wang, Peipei and Stolee, Kathryn T.}, year={2018}, pages={668–678} } @article{dai_dean_wang_gu_lu_2017, title={Hytrace: A Hybrid Approach to Performance Bug Diagnosis in Production Cloud Infrastructures}, DOI={10.1145/3127479.3132562}, abstractNote={Server applications running inside production cloud infrastructures are prone to various performance problems (e.g., software hang, performance slowdown). When those problems occur, developers often have little clue to diagnose those problems. In this paper, we present Hytrace, a novel hybrid approach to diagnosing performance problems in production cloud infrastructures. Hytrace combines rule-based static analysis and runtime inference techniques to achieve higher bug localization accuracy than pure-static and pure-dynamic approaches for performance bugs. Hytrace does not require source code and can be applied to both compiled and interpreted programs such as C/C++ and Java. We conduct experiments using real performance bugs from seven commonly used server applications in production cloud infrastructures. The results show that our approach can significantly improve the performance bug diagnosis accuracy compared to existing diagnosis techniques.}, journal={PROCEEDINGS OF THE 2017 SYMPOSIUM ON CLOUD COMPUTING (SOCC '17)}, author={Dai, Ting and Dean, Daniel and Wang, Peipei and Gu, Xiaohui and Lu, Shan}, year={2017}, pages={641–641} } @article{shu_wang_gorski_andow_nadkarni_deshotels_gionta_enck_gu_2016, title={A Study of Security Isolation Techniques}, volume={49}, ISSN={["1557-7341"]}, DOI={10.1145/2988545}, abstractNote={Security isolation is a foundation of computing systems that enables resilience to different forms of attacks. This article seeks to understand existing security isolation techniques by systematically classifying different approaches and analyzing their properties. We provide a hierarchical classification structure for grouping different security isolation techniques. At the top level, we consider two principal aspects: mechanism and policy. Each aspect is broken down into salient dimensions that describe key properties. We break the mechanism into two dimensions, enforcement location and isolation granularity, and break the policy aspect down into three dimensions: policy generation, policy configurability, and policy lifetime. 
We apply our classification to a set of representative articles that cover a breadth of security isolation techniques and discuss tradeoffs among different design choices and limitations of existing approaches.}, number={3}, journal={ACM COMPUTING SURVEYS}, publisher={ACM}, author={Shu, Rui and Wang, Peipei and Gorski, Sigmund A. and Andow, Benjamin and Nadkarni, Adwait and Deshotels, Luke and Gionta, Jason and Enck, William and Gu, Xiaohui}, year={2016}, month={Dec} } @article{dean_nguyen_wang_gu_sailer_kochut_2016, title={PerfCompass: Online Performance Anomaly Fault Localization and Inference in Infrastructure-as-a-Service Clouds}, volume={27}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2015.2444392}, abstractNote={Infrastructure-as-a-service clouds are becoming widely adopted. However, resource sharing and multi-tenancy have made performance anomalies a top concern for users. Timely debugging those anomalies is paramount for minimizing the performance penalty for users. Unfortunately, this debugging often takes a long time due to the inherent complexity and sharing nature of cloud infrastructures. When an application experiences a performance anomaly, it is important to distinguish between faults with a global impact and faults with a local impact as the diagnosis and recovery steps for faults with a global impact or local impact are quite different. In this paper, we present PerfCompass, an online performance anomaly fault debugging tool that can quantify whether a production-run performance anomaly has a global impact or local impact. PerfCompass can use this information to suggest the root cause as either an external fault (e.g., environment-based) or an internal fault (e.g., software bugs). Furthermore, PerfCompass can identify top affected system calls to provide useful diagnostic hints for detailed performance debugging. PerfCompass does not require source code or runtime application instrumentation, which makes it practical for production systems. We have tested PerfCompass by running five common open source systems (e.g., Apache, MySQL, Tomcat, Hadoop, Cassandra) inside a virtualized cloud testbed. Our experiments use a range of common infrastructure sharing issues and real software bugs. The results show that PerfCompass accurately classifies 23 out of the 24 tested cases without calibration and achieves 100 percent accuracy with calibration. PerfCompass provides useful diagnosis hints within several minutes and imposes negligible runtime overhead to the production system during normal execution time.}, number={6}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Dean, Daniel J. and Nguyen, Hiep and Wang, Peipei and Gu, Xiaohui and Sailer, Anca and Kochut, Andrzej}, year={2016}, month={Jun}, pages={1742–1755} } @inproceedings{wang_nguyen_gu_lu_2016, title={RDE: Replay DEbugging for Diagnosing Production Site Failures}, DOI={10.1109/srds.2016.050}, abstractNote={Online service failures in production computing environments are notoriously difficult to debug. One of the key challenges is to allow the developer to replay the failure execution within an interactive debugging tool such as GDB. Previous work has proposed in-situ approaches to inferring the production-run failure path within the production environment. However, those tools may sometimes suggest failure execution paths that are infeasible to reach by any program inputs. Moreover, the production site often does not record or provide failure-triggering inputs due to user privacy concerns. 
In this paper, we present RDE, a Replay DEbug system that can replay a production-site failure at the development site within an interactive debugging environment without requiring user inputs. RDE takes an inferred production failure path as input and performs execution synthesis using a new guided symbolic execution technique. RDE can tolerate imprecise or inaccurate failure path information by navigating the symbolic execution along a set of selected paths. RDE synthesizes an input from the selected symbolic execution path which can be fed to a debugging tool to replay the failure. We have implemented an initial prototype of RDE and tested it with a set of coreutils bugs. The results show that RDE can successfully replay all the tested bugs within GDB.}, booktitle={Proceedings of the 2016 IEEE 35th Symposium on Reliable Distributed Systems (SRDS)}, author={Wang, Peipei and Nguyen, Hiep and Gu, Xiaohui and Lu, Shan}, year={2016}, pages={327–336} } @article{dean_wang_gu_enck_jin_2015, title={Automatic Server Hang Bug Diagnosis: Feasible Reality or Pipe Dream?}, DOI={10.1109/icac.2015.52}, abstractNote={It is notoriously difficult to diagnose server hang bugs as they often generate little diagnostic information and are difficult to reproduce offline. In this paper, we present a characteristic study of 177 real software hang bugs from 8 common open source server systems (i.e., Apache, Lighttpd, MySQL, Squid, HDFS, Hadoop MapReduce, Tomcat, Cassandra). We identify three major root cause categories (i.e., programmer errors, mishandled values, concurrency issues). We then describe two major problems (i.e., false positives and false negatives) while applying existing rule-based bug detection techniques to those bugs.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON AUTONOMIC COMPUTING}, author={Dean, Daniel J. and Wang, Peipei and Gu, Xiaohui and Enck, William and Jin, Guoliang}, year={2015}, pages={127–132} } @article{wang_dean_gu_2015, title={Understanding Real World Data Corruptions in Cloud Systems}, DOI={10.1109/ic2e.2015.41}, abstractNote={Big data processing is one of the killer applications for cloud systems. MapReduce systems such as Hadoop are the most popular big data processing platforms used in the cloud system. Data corruption is one of the most critical problems in cloud data processing, which not only has serious impact on the integrity of individual application results but also affects the performance and availability of the whole data processing system. In this paper, we present a comprehensive study on 138 real world data corruption incidents reported in Hadoop bug repositories. We characterize those data corruption problems in four aspects: 1) what impact can data corruption have on the application and system? 2) how is data corruption detected? 3) what are the causes of the data corruption? and 4) what problems can occur while attempting to handle data corruption? Our study has made the following findings: 1) the impact of data corruption is not limited to data integrity, 2) existing data corruption detection schemes are quite insufficient: only 25% of data corruption problems are correctly reported, 42% are silent data corruption without any error message, and 21% receive imprecise error report. 
We also found the detection system raised 12% false alarms, 3) there are various causes of data corruption such as improper runtime checking, race conditions, inconsistent block states, improper network failure handling, and improper node crash handling, and 4) existing data corruption handling mechanisms (i.e., data replication, replica deletion, simple re-execution) make frequent mistakes including replicating corrupted data blocks, deleting uncorrupted data blocks, or causing undesirable resource hogging.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON CLOUD ENGINEERING (IC2E 2015)}, author={Wang, Peipei and Dean, Daniel J. and Gu, Xiaohui}, year={2015}, pages={116–125} }