@article{li_zhang_lu_gunawi_gu_huang_li_2023, title={Performance Bug Analysis and Detection for Distributed Storage and Computing Systems}, volume={19}, ISSN={["1553-3093"]}, DOI={10.1145/3580281}, abstractNote={This article systematically studies 99 distributed performance bugs from five widely deployed distributed storage and computing systems (Cassandra, HBase, HDFS, Hadoop MapReduce, and ZooKeeper). We present the TaxPerf database, which collectively organizes the analysis results as over 400 classification labels and over 2,500 lines of bug re-description. TaxPerf is classified into six bug categories (and 18 bug subcategories) by their root causes: resource, blocking, synchronization, optimization, configuration, and logic. TaxPerf can be used as a benchmark for performance bug studies and debug tool designs. Although it is impractical to automatically detect all categories of performance bugs in TaxPerf, we find that an important category of blocking bugs can be effectively solved by analysis tools. We analyze the cascading nature of blocking bugs and design an automatic detection tool called PCatch, which (i) performs program analysis to identify code regions whose execution time can potentially increase dramatically with the workload size; (ii) adapts the traditional happens-before model to reason about software resource contention and performance dependency relationship; and (iii) uses dynamic tracking to identify whether the slowdown propagation is contained in one job. Evaluation shows that PCatch can accurately detect blocking bugs of representative distributed storage and computing systems by observing system executions under small-scale workloads.}, number={3}, journal={ACM TRANSACTIONS ON STORAGE}, author={Li, Jiaxin and Zhang, Yiming and Lu, Shan and Gunawi, Haryadi S. and Gu, Xiaohui and Huang, Feng and Li, Dongsheng}, year={2023}, month={Aug} } @article{he_lin_gu_yeh_zhuang_2022, title={PerfSig: Extracting Performance Bug Signatures via Multi-modality Causal Analysis}, ISSN={["0270-5257"]}, DOI={10.1145/3510003.3510110}, abstractNote={Diagnosing a performance bug triggered in production cloud environments is notoriously challenging. Extracting performance bug signatures can help cloud operators quickly pinpoint the problem and avoid repeating manual efforts for diagnosing similar performance bugs. In this paper, we present PerfSig, a multi-modality performance bug signature extraction tool which can identify principal anomaly patterns and root cause functions for performance bugs. PerfSig performs fine-grained anomaly detection over various machine data such as system metrics, system logs, and function call traces. We then conduct causal analysis across different machine data using an information theory method to pinpoint the root cause function of a performance bug. PerfSig generates bug signatures as the combination of the identified anomaly patterns and root cause functions. We have implemented a prototype of PerfSig and conducted evaluation using 20 real world performance bugs in six commonly used cloud systems.
Our experimental results show that PerfSig captures various kinds of fine-grained anomaly patterns from different machine data and successfully identifies the root cause functions through multi-modality causal analysis for 19 out of 20 tested performance bugs.}, journal={2022 ACM/IEEE 44TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2022)}, author={He, Jingzhu and Lin, Yuhang and Gu, Xiaohui and Yeh, Chin-Chia Michael and Zhuang, Zhongfang}, year={2022}, pages={1669–1680} } @article{lin_tunde-onadele_gu_he_latapie_2022, title={SHIL: Self-Supervised Hybrid Learning for Security Attack Detection in Containerized Applications}, DOI={10.1109/ACSOS55765.2022.00022}, abstractNote={Container security has received much research attention recently. Previous work has proposed to apply various machine learning techniques to detect security attacks in containerized applications. On one hand, supervised machine learning schemes require sufficient labelled training data to achieve good attack detection accuracy. On the other hand, unsupervised machine learning methods are more practical by avoiding training data labelling requirements, but they often suffer from high false alarm rates. In this paper, we present SHIL, a self-supervised hybrid learning solution, which combines unsupervised and supervised learning methods to achieve high accuracy without requiring any manual data labelling. We have implemented a prototype of SHIL and conducted experiments over 41 real world security attacks in 28 commonly used server applications. Our experimental results show that SHIL can reduce false alarms by 39–91% compared to existing supervised or unsupervised machine learning schemes while achieving a higher or similar detection rate.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON AUTONOMIC COMPUTING AND SELF-ORGANIZING SYSTEMS (ACSOS 2022)}, author={Lin, Yuhang and Tunde-Onadele, Olufogorehan and Gu, Xiaohui and He, Jingzhu and Latapie, Hugo}, year={2022}, pages={41–50} } @article{tunde-onadele_lin_gu_he_2022, title={Understanding Software Security Vulnerabilities in Cloud Server Systems}, ISSN={["2373-3845"]}, DOI={10.1109/IC2E55432.2022.00033}, abstractNote={Cloud systems have been widely adopted by many real world production applications. Thus, security vulnerabilities in those cloud systems can cause serious widespread impact. Although previous intrusion detection systems can detect security attacks, understanding the underlying software defects that cause those security vulnerabilities is little studied. In this paper, we conduct a systematic study over 110 software security vulnerabilities in 13 popular cloud server systems. To understand the underlying vulnerabilities, we answer the following questions: 1) what are the root causes of those security vulnerabilities? 2) what threat impact does the vulnerable code have? 3) how do developers patch the vulnerable code? Our results show that the vulnerable code of the studied security vulnerabilities comprises five common categories: 1) improper execution restrictions, 2) improper permission checks, 3) improper resource path-name checks, 4) improper sensitive data handling, and 5) improper synchronization handling.
We further extract principal vulnerable code patterns from those common vulnerability categories.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON CLOUD ENGINEERING (IC2E 2022)}, author={Tunde-Onadele, Olufogorehan and Lin, Yuhang and Gu, Xiaohui and He, Jingzhu}, year={2022}, pages={245–252} } @article{lin_tunde-onadele_gu_2020, title={CDL: Classified Distributed Learning for Detecting Security Attacks in Containerized Applications}, ISSN={["1063-9527"]}, DOI={10.1145/3427228.3427236}, abstractNote={Containers have been widely adopted in production computing environments for their efficiency and low isolation overhead. However, recent studies have shown that containerized applications are prone to various security attacks. Moreover, containerized applications are often highly dynamic and short-lived, which further exacerbates the problem. In this paper, we present CDL, a classified distributed learning framework to achieve efficient security attack detection for containerized applications. CDL integrates online application classification and anomaly detection to overcome the challenge of lacking sufficient training data for dynamic short-lived containers while considering diversified normal behaviors in different applications. We have implemented a prototype of CDL and evaluated it over 33 real world vulnerability attacks in 24 commonly used server applications. Our experimental results show that CDL can reduce the false positive rate from over 12% to 0.24% compared to traditional anomaly detection schemes without aggregating training data. By introducing application classification into container behavior learning, CDL can improve the detection rate from catching 20 attacks to 31 attacks before those attacks succeed. CDL is light-weight and can complete application classification and anomaly detection for each data sample within a few milliseconds.}, journal={36TH ANNUAL COMPUTER SECURITY APPLICATIONS CONFERENCE (ACSAC 2020)}, author={Lin, Yuhang and Tunde-Onadele, Olufogorehan and Gu, Xiaohui}, year={2020}, pages={179–188} } @article{tunde-onadele_lin_he_gu_2020, title={Self-Patch: Beyond Patch Tuesday for Containerized Applications}, DOI={10.1109/ACSOS49614.2020.00022}, abstractNote={Containers have become increasingly popular in distributed computing environments. However, recent studies have shown that containerized applications are susceptible to various security attacks. Traditional periodically scheduled software update approaches not only become ineffective under dynamic container environments but also impose high overhead to containers. In this paper, we present Self-Patch, a new self-triggering patching framework for applications running inside containers. Self-Patch combines light-weight runtime attack detection and dynamic targeted patching to achieve more efficient and effective security protection for containerized applications. We evaluated our schemes over 31 real world vulnerability attacks in 23 commonly used server applications.
Results show that Self-Patch can accurately detect and classify 81% of attacks and reduce patching overhead by up to 84%.}, journal={2020 IEEE INTERNATIONAL CONFERENCE ON AUTONOMIC COMPUTING AND SELF-ORGANIZING SYSTEMS (ACSOS 2020)}, author={Tunde-Onadele, Olufogorehan and Lin, Yuhang and He, Jingzhu and Gu, Xiaohui}, year={2020}, pages={21–27} } @article{tunde-onadele_he_dai_gu_2019, title={A Study on Container Vulnerability Exploit Detection}, ISSN={["2373-3845"]}, DOI={10.1109/IC2E.2019.00026}, abstractNote={Containers have become increasingly popular for deploying applications in cloud computing infrastructures. However, recent studies have shown that containers are prone to various security attacks. In this paper, we conduct a study on the effectiveness of various vulnerability detection schemes for containers. Specifically, we implement and evaluate a set of static and dynamic vulnerability attack detection schemes using 28 real world vulnerability exploits that widely exist in docker images. Our results show that the static vulnerability scanning scheme only detects 3 out of 28 tested vulnerabilities and dynamic anomaly detection schemes detect 22 vulnerability exploits. Combining static and dynamic schemes can further improve the detection rate to 86% (i.e., 24 out of 28 exploits). We also observe that the dynamic anomaly detection scheme can achieve more than 20 seconds lead time (i.e., a time window before attacks succeed) for a group of commonly seen attacks in containers that try to gain a shell and execute arbitrary code.}, journal={2019 IEEE INTERNATIONAL CONFERENCE ON CLOUD ENGINEERING (IC2E)}, author={Tunde-Onadele, Olufogorehan and He, Jingzhu and Dai, Ting and Gu, Xiaohui}, year={2019}, pages={121–127} } @article{kang_dai_jean-louis_tao_gu_2019, title={FabZK: Supporting Privacy-Preserving, Auditable Smart Contracts in Hyperledger Fabric}, ISSN={["1530-0889"]}, DOI={10.1109/DSN.2019.00061}, abstractNote={On a Blockchain network, transaction data are exposed to all participants. To preserve privacy and confidentiality in transactions, while still maintaining data immutability, we design and implement FabZK. FabZK conceals transaction details on a shared ledger by storing only encrypted data from each transaction (e.g., payment amount), and by anonymizing the transactional relationship (e.g., payer and payee) between members in a Blockchain network. It achieves both privacy and auditability by supporting verifiable Pedersen commitments and constructing zero-knowledge proofs. FabZK is implemented as an extension to the open source Hyperledger Fabric. It provides APIs to easily enable data privacy in both client code and chaincode. It also supports on-demand, automated auditing based on encrypted data. Our evaluation shows that FabZK offers strong privacy-preserving capabilities, while delivering reasonable performance for the applications developed based on its framework.}, journal={2019 49TH ANNUAL IEEE/IFIP INTERNATIONAL CONFERENCE ON DEPENDABLE SYSTEMS AND NETWORKS (DSN 2019)}, author={Kang, Hui and Dai, Ting and Jean-Louis, Nerla and Tao, Shu and Gu, Xiaohui}, year={2019}, pages={543–555} } @article{dai_dean_wang_gu_lu_2019, title={Hytrace: A Hybrid Approach to Performance Bug Diagnosis in Production Cloud Infrastructures}, volume={30}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2018.2858800}, abstractNote={Server applications running inside production cloud infrastructures are prone to various performance problems (e.g., software hang, performance slowdown). 
When those problems occur, developers often have little clue to diagnose those problems. In this paper, we present Hytrace, a novel hybrid approach to diagnosing performance problems in production cloud infrastructures. Hytrace combines rule-based static analysis and runtime inference techniques to achieve higher bug localization accuracy than pure-static and pure-dynamic approaches for performance bugs. Hytrace does not require source code and can be applied to both compiled and interpreted programs such as C/C++ and Java. We conduct experiments using real performance bugs from seven commonly used server applications in production cloud infrastructures. The results show that our approach can significantly improve the performance bug diagnosis accuracy compared to existing diagnosis techniques.}, number={1}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Dai, Ting and Dean, Daniel and Wang, Peipei and Gu, Xiaohui and Lu, Shan}, year={2019}, month={Jan}, pages={107–118} } @article{he_dai_gu_2019, title={TFix: Automatic Timeout Bug Fixing in Production Server Systems}, ISSN={["1063-6927"]}, DOI={10.1109/ICDCS.2019.00067}, abstractNote={Timeout is widely used to handle unexpected failures in distributed systems. However, improper use of timeout schemes can cause serious availability and performance issues, which are often difficult to fix due to lack of diagnostic information. In this paper, we present TFix, an automatic timeout bug fixing system for correcting misused timeout bugs in production systems. TFix adopts a drill-down bug analysis protocol that can narrow down the root cause of a misused timeout bug and produce recommendations for correcting the root cause. TFix first employs a system call frequent episode mining scheme to check whether a timeout bug is caused by a misused timeout variable. TFix then employs application tracing to identify timeout-affected functions. Next, TFix uses taint analysis to localize the misused timeout variable. Last, TFix produces recommendations for proper timeout variable values based on the tracing results during normal runs. We have implemented a prototype of TFix and conducted extensive experiments using 13 real world server timeout bugs. Our experimental results show that TFix can correctly localize the misused timeout variables and suggest proper timeout values for fixing those bugs.}, journal={2019 39TH IEEE INTERNATIONAL CONFERENCE ON DISTRIBUTED COMPUTING SYSTEMS (ICDCS 2019)}, author={He, Jingzhu and Dai, Ting and Gu, Xiaohui}, year={2019}, pages={612–623} } @article{dai_he_gu_lu_wang_2018, title={DScope: Detecting Real-World Data Corruption Hang Bugs in Cloud Server Systems}, DOI={10.1145/3267809.3267844}, abstractNote={Cloud server systems such as Hadoop and Cassandra have enabled many real-world data-intensive applications running inside computing clouds. However, those systems present many data-corruption and performance problems which are notoriously difficult to debug due to the lack of diagnosis information. In this paper, we present DScope, a tool that statically detects data-corruption related software hang bugs in cloud server systems. DScope statically analyzes I/O operations and loops in a software package, and identifies loops whose exit conditions can be affected by I/O operations through returned data, returned error code, or I/O exception handling.
After identifying those loops which are prone to hang problems under data corruption, DScope conducts loop bound and loop stride analysis to prune out false positives. We have implemented DScope and evaluated it using 9 common cloud server systems. Our results show that DScope can detect 42 real software hang bugs including 29 newly discovered software hang bugs. In contrast, existing bug detection tools miss detecting most of those bugs.}, journal={PROCEEDINGS OF THE 2018 ACM SYMPOSIUM ON CLOUD COMPUTING (SOCC '18)}, author={Dai, Ting and He, Jingzhu and Gu, Xiaohui and Lu, Shan and Wang, Peipei}, year={2018}, pages={313–325} } @article{li_chen_liu_lu_zhang_gunawi_gu_lu_li_2018, title={PCatch: Automatically Detecting Performance Cascading Bugs in Cloud Systems}, DOI={10.1145/3190508.3190552}, abstractNote={Distributed systems have become the backbone of modern clouds. Users often expect high scalability and performance isolation from distributed systems. Unfortunately, a type of poor software design, which we refer to as performance cascading bugs (PCbugs), can often cause the slowdown of non-scalable code in one job to propagate, causing global performance degradation and even threatening system availability. This paper presents a tool, PCatch, that can automatically predict PCbugs by analyzing system execution under small-scale workloads. PCatch contains three key components in predicting PCbugs. It uses program analysis to identify code regions whose execution time can potentially increase dramatically with the workload size; it adapts the traditional happens-before model to reason about software resource contention and performance dependency relationship; it uses dynamic tracking to identify whether the slowdown propagation is contained in one job or not. Our evaluation using representative distributed systems, Cassandra, Hadoop MapReduce, HBase, and HDFS, shows that PCatch can accurately predict PCbugs based on small-scale workload execution.}, journal={EUROSYS '18: PROCEEDINGS OF THE THIRTEENTH EUROSYS CONFERENCE}, author={Li, Jiaxin and Chen, Yuxi and Liu, Haopeng and Lu, Shan and Zhang, Yiming and Gunawi, Haryadi S. and Gu, Xiaohui and Lu, Xicheng and Li, Dongsheng}, year={2018} } @article{he_dai_gu_2018, title={TScope: Automatic Timeout Bug Identification for Server Systems}, ISSN={["2474-0756"]}, DOI={10.1109/ICAC.2018.00010}, abstractNote={Timeout is commonly used to handle unexpected failures in server systems. However, improper use of timeout can cause server systems to hang or experience performance degradation. In this paper, we present TScope, an automatic timeout bug identification tool for server systems. TScope leverages kernel-level system call tracing and machine learning based anomaly detection and feature extraction schemes to achieve timeout bug identification. TScope introduces a unique system call selection scheme to achieve higher accuracy than existing generic performance bug detection tools. We have implemented a prototype of TScope and conducted extensive experiments using 19 real-world server performance bugs, including 12 timeout bugs and 7 non-timeout performance bugs. The experimental results show that TScope correctly classifies 18 out of 19 bugs. Compared to existing runtime bug detection schemes, TScope reduces the average false positive rate from 47.24% to 0.8%. 
TScope is light-weight and does not require application instrumentation, which makes it practical for production server performance bug identification.}, journal={15TH IEEE INTERNATIONAL CONFERENCE ON AUTONOMIC COMPUTING (ICAC 2018)}, author={He, Jingzhu and Dai, Ting and Gu, Xiaohui}, year={2018}, pages={1–10} } @article{dai_dean_wang_gu_lu_2017, title={Hytrace: A Hybrid Approach to Performance Bug Diagnosis in Production Cloud Infrastructures}, DOI={10.1145/3127479.3132562}, abstractNote={Server applications running inside production cloud infrastructures are prone to various performance problems (e.g., software hang, performance slowdown). When those problems occur, developers often have little clue to diagnose those problems. In this paper, we present Hytrace, a novel hybrid approach to diagnosing performance problems in production cloud infrastructures. Hytrace combines rule-based static analysis and runtime inference techniques to achieve higher bug localization accuracy than pure-static and pure-dynamic approaches for performance bugs. Hytrace does not require source code and can be applied to both compiled and interpreted programs such as C/C++ and Java. We conduct experiments using real performance bugs from seven commonly used server applications in production cloud infrastructures. The results show that our approach can significantly improve the performance bug diagnosis accuracy compared to existing diagnosis techniques.}, journal={PROCEEDINGS OF THE 2017 SYMPOSIUM ON CLOUD COMPUTING (SOCC '17)}, author={Dai, Ting and Dean, Daniel and Wang, Peipei and Gu, Xiaohui and Lu, Shan}, year={2017}, pages={641–641} } @article{shu_wang_gorski_andow_nadkarni_deshotels_gionta_enck_gu_2016, title={A Study of Security Isolation Techniques}, volume={49}, ISSN={["1557-7341"]}, DOI={10.1145/2988545}, abstractNote={Security isolation is a foundation of computing systems that enables resilience to different forms of attacks. This article seeks to understand existing security isolation techniques by systematically classifying different approaches and analyzing their properties. We provide a hierarchical classification structure for grouping different security isolation techniques. At the top level, we consider two principal aspects: mechanism and policy. Each aspect is broken down into salient dimensions that describe key properties. We break the mechanism into two dimensions, enforcement location and isolation granularity, and break the policy aspect down into three dimensions: policy generation, policy configurability, and policy lifetime. We apply our classification to a set of representative articles that cover a breadth of security isolation techniques and discuss tradeoffs among different design choices and limitations of existing approaches.}, number={3}, journal={ACM COMPUTING SURVEYS}, publisher={ACM}, author={Shu, Rui and Wang, Peipei and Gorski, Sigmund A. and Andow, Benjamin and Nadkarni, Adwait and Deshotels, Luke and Gionta, Jason and Enck, William and Gu, Xiaohui}, year={2016}, month={Dec} } @article{dean_nguyen_wang_gu_sailer_kochut_2016, title={PerfCompass: Online Performance Anomaly Fault Localization and Inference in Infrastructure-as-a-Service Clouds}, volume={27}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2015.2444392}, abstractNote={Infrastructure-as-a-service clouds are becoming widely adopted. However, resource sharing and multi-tenancy have made performance anomalies a top concern for users. 
Timely debugging of those anomalies is paramount for minimizing the performance penalty for users. Unfortunately, this debugging often takes a long time due to the inherent complexity and sharing nature of cloud infrastructures. When an application experiences a performance anomaly, it is important to distinguish between faults with a global impact and faults with a local impact, as the diagnosis and recovery steps for faults with a global impact or local impact are quite different. In this paper, we present PerfCompass, an online performance anomaly fault debugging tool that can quantify whether a production-run performance anomaly has a global impact or local impact. PerfCompass can use this information to suggest the root cause as either an external fault (e.g., environment-based) or an internal fault (e.g., software bugs). Furthermore, PerfCompass can identify top affected system calls to provide useful diagnostic hints for detailed performance debugging. PerfCompass does not require source code or runtime application instrumentation, which makes it practical for production systems. We have tested PerfCompass by running five common open source systems (e.g., Apache, MySQL, Tomcat, Hadoop, Cassandra) inside a virtualized cloud testbed. Our experiments use a range of common infrastructure sharing issues and real software bugs. The results show that PerfCompass accurately classifies 23 out of the 24 tested cases without calibration and achieves 100 percent accuracy with calibration. PerfCompass provides useful diagnosis hints within several minutes and imposes negligible runtime overhead to the production system during normal execution time.}, number={6}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Dean, Daniel J. and Nguyen, Hiep and Wang, Peipei and Gu, Xiaohui and Sailer, Anca and Kochut, Andrzej}, year={2016}, month={Jun}, pages={1742–1755} } @inproceedings{das_mueller_gu_iyengar_2016, title={Performance analysis of a multi-tenant in-memory data grid}, DOI={10.1109/cloud.2016.0144}, abstractNote={Distributed key-value stores have become indispensable for large-scale, low-latency applications. Many cloud services have deployed in-memory data grids for their enterprise infrastructures and support multi-tenancy services. But it is still difficult to provide consistent performance to all tenants for fluctuating workloads that need to scale out. Many popular key-value stores suffer from performance problems at scale and different tenant requirements. To this end, we present our study with Hazelcast, a popular open source data grid, and provide insights into contention and performance bottlenecks. Through experimental analysis, this paper uncovers scenarios of performance degradation followed by optimized performance via end-point multiplexing. Our study suggests that processing an increasing number of client requests while spawning fewer threads helps improve performance.}, booktitle={Proceedings of 2016 IEEE 9th International Conference on Cloud Computing (CLOUD)}, author={Das, A. and Mueller, F. and Gu, X. H. and Iyengar, A.}, year={2016}, pages={956–959} } @inproceedings{wang_nguyen_gu_lu_2016, title={RDE: Replay DEbugging for Diagnosing Production Site Failures}, DOI={10.1109/srds.2016.050}, abstractNote={Online service failures in production computing environments are notoriously difficult to debug. One of the key challenges is to allow the developer to replay the failure execution within an interactive debugging tool such as GDB.
Previous work has proposed in-situ approaches to inferring the production-run failure path within the production environment. However, those tools may sometimes suggest failure execution paths that are infeasible to reach by any program inputs. Moreover, the production site often does not record or provide failure-triggering inputs due to user privacy concerns. In this paper, we present RDE, a Replay DEbug system that can replay a production-site failure at the development site within an interactive debugging environment without requiring user inputs. RDE takes an inferred production failure path as input and performs execution synthesis using a new guided symbolic execution technique. RDE can tolerate imprecise or inaccurate failure path information by navigating the symbolic execution along a set of selected paths. RDE synthesizes an input from the selected symbolic execution path which can be fed to a debugging tool to replay the failure. We have implemented an initial prototype of RDE and tested it with a set of coreutils bugs. The results show that RDE can successfully replay all the tested bugs within GDB.}, booktitle={Proceedings of 2016 IEEE 35th Symposium on Reliable Distributed Systems (SRDS)}, author={Wang, P. P. and Nguyen, H. and Gu, X. H. and Lu, S.}, year={2016}, pages={327–336} } @article{dean_wang_gu_enck_jin_2015, title={Automatic Server Hang Bug Diagnosis: Feasible Reality or Pipe Dream?}, DOI={10.1109/icac.2015.52}, abstractNote={It is notoriously difficult to diagnose server hang bugs as they often generate little diagnostic information and are difficult to reproduce offline. In this paper, we present a characteristic study of 177 real software hang bugs from 8 common open source server systems (i.e., Apache, Lighttpd, MySQL, Squid, HDFS, Hadoop MapReduce, Tomcat, Cassandra). We identify three major root cause categories (i.e., programmer errors, mishandled values, and concurrency issues). We then describe two major problems (i.e., false positives and false negatives) while applying existing rule-based bug detection techniques to those bugs.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON AUTONOMIC COMPUTING}, author={Dean, Daniel J. and Wang, Peipei and Gu, Xiaohui and Enck, William and Jin, Guoliang}, year={2015}, pages={127–132} } @article{wang_dean_gu_2015, title={Understanding Real World Data Corruptions in Cloud Systems}, DOI={10.1109/ic2e.2015.41}, abstractNote={Big data processing is one of the killer applications for cloud systems. MapReduce systems such as Hadoop are the most popular big data processing platforms used in cloud systems. Data corruption is one of the most critical problems in cloud data processing, which not only has a serious impact on the integrity of individual application results but also affects the performance and availability of the whole data processing system. In this paper, we present a comprehensive study on 138 real world data corruption incidents reported in Hadoop bug repositories. We characterize those data corruption problems in four aspects: 1) what impact can data corruption have on the application and system? 2) how is data corruption detected? 3) what are the causes of the data corruption? and 4) what problems can occur while attempting to handle data corruption?
Our study has made the following findings: 1) the impact of data corruption is not limited to data integrity; 2) existing data corruption detection schemes are quite insufficient: only 25% of data corruption problems are correctly reported, 42% are silent data corruptions without any error message, and 21% receive imprecise error reports, and we also found that the detection system raised 12% false alarms; 3) there are various causes of data corruption such as improper runtime checking, race conditions, inconsistent block states, improper network failure handling, and improper node crash handling; and 4) existing data corruption handling mechanisms (i.e., data replication, replica deletion, simple re-execution) make frequent mistakes including replicating corrupted data blocks, deleting uncorrupted data blocks, or causing undesirable resource hogging.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON CLOUD ENGINEERING (IC2E 2015)}, author={Wang, Peipei and Dean, Daniel J. and Gu, Xiaohui}, year={2015}, pages={116–125} } @article{du_dean_tan_gu_yu_2014, title={Scalable Distributed Service Integrity Attestation for Software-as-a-Service Clouds}, volume={25}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2013.62}, abstractNote={Software-as-a-service (SaaS) cloud systems enable application service providers to deliver their applications via massive cloud computing infrastructures. However, due to their sharing nature, SaaS clouds are vulnerable to malicious attacks. In this paper, we present IntTest, a scalable and effective service integrity attestation framework for SaaS clouds. IntTest provides a novel integrated attestation graph analysis scheme that can provide stronger attacker pinpointing power than previous schemes. Moreover, IntTest can automatically enhance result quality by replacing bad results produced by malicious attackers with good results produced by benign service providers. We have implemented a prototype of the IntTest system and tested it on a production cloud computing infrastructure using IBM System S stream processing applications. Our experimental results show that IntTest can achieve higher attacker pinpointing accuracy than existing approaches. IntTest does not require any special hardware or secure kernel support and imposes little performance impact to the application, which makes it practical for large-scale cloud systems.}, number={3}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Du, Juan and Dean, Daniel J. and Tan, Yongmin and Gu, Xiaohui and Yu, Ting}, year={2014}, month={Mar}, pages={730–739} } @article{nguyen_shen_tan_gu_2013, title={FChain: Toward Black-box Online Fault Localization for Cloud Systems}, ISSN={["1063-6927"]}, DOI={10.1109/icdcs.2013.26}, abstractNote={Distributed applications running inside cloud systems are prone to performance anomalies due to various reasons such as resource contentions, software bugs, and hardware failures. One big challenge for diagnosing an abnormal distributed application is to pinpoint the faulty components. In this paper, we present a black-box online fault localization system called FChain that can pinpoint faulty components immediately after a performance anomaly is detected. FChain first discovers the onset time of abnormal behaviors at different components by distinguishing the abnormal change point from many change points caused by normal workload fluctuations. Faulty components are then pinpointed based on the abnormal change propagation patterns and inter-component dependency relationships.
FChain performs runtime validation to further filter out false alarms. We have implemented FChain on top of the Xen platform and tested it using several benchmark applications (RUBiS, Hadoop, and IBM System S). Our experimental results show that FChain can pinpoint the faulty components with high accuracy within a few seconds. FChain can achieve up to 90% higher precision and 20% higher recall than existing schemes. FChain is non-intrusive and light-weight, imposing less than 1% overhead to the cloud system.}, journal={2013 IEEE 33RD INTERNATIONAL CONFERENCE ON DISTRIBUTED COMPUTING SYSTEMS (ICDCS)}, author={Nguyen, Hiep and Shen, Zhiming and Tan, Yongmin and Gu, Xiaohui}, year={2013}, pages={21–30} } @article{tan_venkatesh_gu_2013, title={Resilient Self-Compressive Monitoring for Large-Scale Hosting Infrastructures}, volume={24}, DOI={10.1109/tpds.2012.167}, abstractNote={Large-scale hosting infrastructures have become the fundamental platforms for many real-world systems such as cloud computing infrastructures, enterprise data centers, and massive data processing systems. However, it is a challenging task to achieve both scalability and high precision while monitoring a large number of intranode and internode attributes (e.g., CPU usage, free memory, free disk, internode network delay). In this paper, we present the design and implementation of a Resilient self-Compressive Monitoring (RCM) system for large-scale hosting infrastructures. RCM achieves scalable distributed monitoring by performing online data compression to reduce remote data collection cost. RCM provides failure resilience to achieve robust monitoring for dynamic distributed systems where host and network failures are common. We have conducted extensive experiments using a set of real monitoring data from NCSU's Virtual Computing Lab (VCL), PlanetLab, a Google cluster, and real Internet traffic matrices. The experimental results show that RCM can achieve up to 200 percent higher compression ratio and several orders of magnitude less overhead than the existing approaches.}, number={3}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Tan, Y. M. and Venkatesh, V. and Gu, X. H.}, year={2013}, pages={576–586} } @article{tan_nguyen_shen_gu_venkatramani_rajan_2012, title={PREPARE: Predictive Performance Anomaly Prevention for Virtualized Cloud Systems}, ISSN={["1063-6927"]}, DOI={10.1109/icdcs.2012.65}, abstractNote={Virtualized cloud systems are prone to performance anomalies due to various reasons such as resource contentions, software bugs, and hardware failures. In this paper, we present a novel Predictive Performance Anomaly Prevention (PREPARE) system that provides automatic performance anomaly prevention for virtualized cloud computing infrastructures. PREPARE integrates online anomaly prediction, learning-based cause inference, and predictive prevention actuation to minimize the performance anomaly penalty without human intervention. We have implemented PREPARE on top of the Xen platform and tested it on NCSU's Virtual Computing Lab using a commercial data stream processing system (IBM System S) and an online auction benchmark (RUBiS).
The experimental results show that PREPARE can effectively prevent performance anomalies while imposing low overhead to the cloud infrastructure.}, journal={2012 IEEE 32ND INTERNATIONAL CONFERENCE ON DISTRIBUTED COMPUTING SYSTEMS (ICDCS)}, author={Tan, Yongmin and Nguyen, Hiep and Shen, Zhiming and Gu, Xiaohui and Venkatramani, Chitra and Rajan, Deepak}, year={2012}, pages={285–294} } @article{kc_gu_2011, title={ELT: Efficient Log-based Troubleshooting System for Cloud Computing Infrastructures}, ISSN={["2575-8462"]}, DOI={10.1109/srds.2011.11}, abstractNote={We present an Efficient Log-based Troubleshooting (ELT) system for cloud computing infrastructures. ELT adopts a novel hybrid log mining approach that combines coarse-grained and fine-grained log features to achieve both high accuracy and low overhead. Moreover, ELT can automatically extract key log messages and perform invariant checking to greatly simplify the troubleshooting task for the system administrator. We have implemented a prototype of the ELT system and conducted an extensive experimental study using real management console logs of a production cloud system and a Hadoop cluster. Our experimental results show that ELT can achieve more efficient and powerful troubleshooting support than existing schemes. More importantly, ELT can find software bugs that cannot be detected by current cloud system management practice.}, journal={2011 30TH IEEE INTERNATIONAL SYMPOSIUM ON RELIABLE DISTRIBUTED SYSTEMS (SRDS)}, author={Kc, Kamal and Gu, Xiaohui}, year={2011}, pages={11–20} } @article{tan_gu_wang_2010, title={Adaptive System Anomaly Prediction for Large-Scale Hosting Infrastructures}, ISBN={["978-1-60558-888-9"]}, DOI={10.1145/1835698.1835741}, abstractNote={Large-scale hosting infrastructures require automatic system anomaly management to achieve continuous system operation. In this paper, we present a novel adaptive runtime anomaly prediction system, called ALERT, to achieve robust hosting infrastructures. In contrast to traditional anomaly detection schemes, ALERT aims at raising advance anomaly alerts to achieve just-in-time anomaly prevention. We propose a novel context-aware anomaly prediction scheme to improve prediction accuracy in dynamic hosting infrastructures. We have implemented the ALERT system and deployed it on several production hosting infrastructures such as an IBM System S stream processing cluster and PlanetLab. Our experiments show that ALERT can achieve high prediction accuracy for a range of system anomalies and impose low overhead to the hosting infrastructure.}, journal={PODC 2010: PROCEEDINGS OF THE 2010 ACM SYMPOSIUM ON PRINCIPLES OF DISTRIBUTED COMPUTING}, author={Tan, Yongmin and Gu, Xiaohui and Wang, Haixun}, year={2010}, pages={173–182} } @inproceedings{gu_wang_2009, title={Online anomaly prediction for robust cluster systems}, DOI={10.1109/icde.2009.128}, abstractNote={In this paper, we present a stream-based mining algorithm for online anomaly prediction. Many real-world applications such as data stream analysis require continuous cluster operation. Unfortunately, today's large-scale cluster systems are still vulnerable to various software and hardware problems. System administrators are often overwhelmed by the tasks of correcting various system anomalies such as processing bottlenecks (i.e., full stream buffers), resource hot spots, and service level objective (SLO) violations. Our anomaly prediction scheme raises early alerts for impending system anomalies and suggests possible anomaly causes.
Specifically, we employ Bayesian classification methods to capture different anomaly symptoms and infer anomaly causes. Markov models are introduced to capture the changing patterns of different measurement metrics. More importantly, our scheme combines Markov models and Bayesian classification methods to predict when a system anomaly will appear in the foreseeable future and what the possible anomaly causes are. To the best of our knowledge, our work provides the first stream-based mining algorithm for predicting system anomalies. We have implemented our approach within the IBM System S distributed stream processing cluster, and conducted case study experiments using fully implemented distributed data analysis applications processing real application workloads. Our experiments show that our approach efficiently predicts and diagnoses several bottleneck anomalies with high accuracy while imposing low overhead to the cluster system.}, booktitle={ICDE: 2009 IEEE 25th International Conference on Data Engineering, Vols 1-3}, author={Gu, X. H. and Wang, H. X.}, year={2009}, pages={1000–1011} } @article{repantis_gu_kalogeraki_2009, title={QoS-Aware Shared Component Composition for Distributed Stream Processing Systems}, volume={20}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2008.165}, abstractNote={Many emerging online data analysis applications require applying continuous query operations such as correlation, aggregation, and filtering to data streams in real time. Distributed stream processing systems allow in-network stream processing to achieve better scalability and quality-of-service (QoS) provision. In this paper, we present Synergy, a novel distributed stream processing middleware that provides automatic sharing-aware component composition capability. Synergy enables efficient reuse of both result streams and processing components, while composing distributed stream processing applications with QoS demands. It provides a set of fully distributed algorithms to discover and evaluate the reusability of available result streams and processing components when instantiating new stream applications. Specifically, Synergy performs QoS impact projection to examine whether the shared processing can cause QoS violations on currently running applications. The QoS impact projection algorithm can handle different types of streams including both regular traffic and bursty traffic. If no existing processing components can be reused, Synergy dynamically deploys new components at strategic locations to satisfy new application requests. We have implemented a prototype of the Synergy middleware and evaluated its performance on both PlanetLab and simulation testbeds. The experimental results show that Synergy can achieve much better resource utilization and QoS provisioning than previously proposed schemes, by judiciously sharing streams and components during application composition.}, number={7}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Repantis, Thomas and Gu, Xiaohui and Kalogeraki, Vana}, year={2009}, month={Jul}, pages={968–982} } @article{wei_du_yu_gu_2009, title={SecureMR: A Service Integrity Assurance Framework for MapReduce}, ISBN={["978-0-7695-3919-5"]}, DOI={10.1109/acsac.2009.17}, abstractNote={MapReduce has become increasingly popular as a powerful parallel data processing model.
To deploy MapReduce as a data processing service over open systems such as service-oriented architecture, cloud computing, and volunteer computing, we must provide necessary security mechanisms to protect the integrity of MapReduce data processing services. In this paper, we present SecureMR, a practical service integrity assurance framework for MapReduce. SecureMR consists of five security components, which provide a set of practical security mechanisms that not only ensure MapReduce service integrity and prevent replay and Denial of Service (DoS) attacks, but also preserve the simplicity, applicability, and scalability of MapReduce. We have implemented a prototype of SecureMR based on Hadoop, an open source MapReduce implementation. Our analytical study and experimental results show that SecureMR can ensure data processing service integrity while imposing low performance overhead.}, journal={25TH ANNUAL COMPUTER SECURITY APPLICATIONS CONFERENCE}, author={Wei, Wei and Du, Juan and Yu, Ting and Gu, Xiaohui}, year={2009}, pages={73–82} } @inproceedings{gong_ramaswamy_gu_ma_2009, title={SigLM: Signature-Driven Load Management for Cloud Computing Infrastructures}, booktitle={IWQoS: 2009 IEEE 17th International Workshop on Quality of Service}, author={Gong, Z. H. and Ramaswamy, P. and Gu, X. H. and Ma, X. S.}, year={2009}, pages={226–234} } @article{gu_wen_yu_shae_2008, title={peerTalk: A peer-to-peer multiparty voice-over-IP system}, volume={19}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2007.70766}, abstractNote={Multiparty voice-over-IP (MVoIP) services allow a group of people to freely communicate with each other via the Internet, which have many important applications such as online gaming and teleconferencing. In this paper, we present a peer-to-peer MVoIP system called peerTalk. Compared to traditional approaches such as server-based mixing, peerTalk achieves better scalability and failure resilience by dynamically distributing the stream processing workload among different peers. Particularly, peerTalk decouples the MVoIP service delivery into two phases: mixing phase and distribution phase. The decoupled model allows us to explore the asymmetric property of MVoIP services (for example, distinct speaking/listening activities and unequal inbound/outbound bandwidths) so that the system can better adapt to distinct stream mixing and distribution requirements. To overcome arbitrary peer departures/failures, peerTalk provides lightweight backup schemes to achieve fast failure recovery. We have implemented a prototype of the peerTalk system and evaluated its performance using both a large-scale simulation testbed and a real Internet environment. Our initial implementation demonstrates the feasibility of our approach and shows promising results: peerTalk can outperform existing approaches such as P2P overlay multicast and coupled distributed processing for providing MVoIP services.}, number={4}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Gu, Xiaohui and Wen, Zhen and Yu, Philip S.
and Shae, Zon-Yin}, year={2008}, month={Apr}, pages={515–528} } @inbook{repantis_gu_kalogeraki_2006, title={Synergy: Sharing-Aware Component Composition for Distributed Stream Processing Systems}, ISBN={9783540490234 9783540682561}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/11925071_17}, DOI={10.1007/11925071_17}, abstractNote={Many emerging on-line data analysis applications require applying continuous query operations such as correlation, aggregation, and filtering to data streams in real-time. Distributed stream processing systems allow in-network stream processing to achieve better scalability and quality-of-service (QoS) provision. In this paper, we present Synergy, a distributed stream processing middleware that provides sharing-aware component composition. Synergy enables efficient reuse of both data streams and processing components, while composing distributed stream processing applications with QoS demands. Synergy provides a set of fully distributed algorithms to discover and evaluate the reusability of available data streams and processing components when instantiating new stream applications. For QoS provision, Synergy performs QoS impact projection to examine whether the shared processing can cause QoS violations on currently running applications. We have implemented a prototype of the Synergy middleware and evaluated its performance on both PlanetLab and simulation testbeds. The experimental results show that Synergy can achieve much better resource utilization and QoS provision than previously proposed schemes, by judiciously sharing streams and processing components during application composition.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Repantis, Thomas and Gu, Xiaohui and Kalogeraki, Vana}, year={2006}, pages={322–341} } @inbook{gu_yu_2005, title={Adaptive Load Diffusion for Stream Joins}, ISBN={9783540303237 9783540322696}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/11587552_22}, DOI={10.1007/11587552_22}, abstractNote={Data stream processing has become increasingly important as many emerging applications call for sophisticated real-time processing over data streams, such as stock trading surveillance, network traffic monitoring, and sensor data analysis. Stream joins are among the most important stream processing operations, which can be used to detect linkages and correlations between different data streams. One major challenge in processing stream joins is to handle continuous, high-volume, and time-varying data streams under resource constraints. In this paper, we present a novel load diffusion system to enable scalable execution of resource-intensive stream joins using an ensemble of server hosts. The load diffusion is achieved by a simple correlation-aware stream partition algorithm. Different from previous work, the load diffusion system can (1) achieve fine-grained load sharing in the distributed stream processing system; and (2) produce exact query answers without missing any join results or generating duplicate join results. Our experimental results show that the load diffusion scheme can greatly improve the system throughput and achieve more balanced load distribution.}, booktitle={Middleware 2005}, publisher={Springer Berlin Heidelberg}, author={Gu, Xiaohui and Yu, Philip S.}, year={2005}, pages={411–420} }