@article{tu_yu_menzies_2022, title={Better Data Labelling With EMBLEM (and how that Impacts Defect Prediction)}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.2986415}, DOI={10.1109/TSE.2020.2986415}, abstractNote={Standard automatic methods for recognizing problematic development commits can be greatly improved via the incremental application of human+artificial expertise. In this approach, called EMBLEM, an AI tool first explores the software development process to label commits that are most problematic. Humans then apply their expertise to check those labels (perhaps resulting in the AI updating the support vectors within its SVM learner). We recommend this human+AI partnership for several reasons. When a new domain is encountered, EMBLEM can learn better ways to label which comments refer to real problems. Also, in studies with 9 open source software projects, labelling via EMBLEM's incremental application of human+AI is at least an order of magnitude cheaper than existing methods ($\approx$ eight times). Further, EMBLEM is very effective. For the data sets explored here, EMBLEM's better labelling methods significantly improved $P_{opt}20$ and G-score performance in nearly all the projects studied.}, number={1}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Tu, Huy and Yu, Zhe and Menzies, Tim}, year={2022}, month={Jan}, pages={278–294} } @article{tu_menzies_2022, title={DebtFree: minimizing labeling cost in self-admitted technical debt identification using semi-supervised learning}, volume={27}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-022-10121-w}, abstractNote={Keeping track of and managing Self-Admitted Technical Debts (SATDs) is important for maintaining a healthy software project. The current active-learning SATD recognition tool involves manual inspection of 24% of the test comments on average to reach 90% recall. Among all the test comments, about 5% are SATDs. Human experts are therefore required to read almost five times as many comments as there are SATDs, which indicates the inefficiency of the tool. Plus, human experts are still prone to error: 95% of the false-positive labels from previous work were actually true positives. To solve the above problems, we propose DebtFree, a two-mode framework based on unsupervised learning for identifying SATDs. In mode1, when the existing training data is unlabeled, DebtFree starts with an unsupervised learner to automatically pseudo-label the programming comments in the training data. In contrast, in mode2 where labels are available with the corresponding training data, DebtFree starts with a pre-processor that identifies the comments most likely to be SATDs in the test dataset. Then, our machine learning model is employed to assist human experts in manually identifying the remaining SATDs. Our experiments on 10 software projects show that both modes yield statistically significant improvements in effectiveness over the state-of-the-art automated and semi-automated models. 
Specifically, DebtFree can reduce the labeling effort by 99% in mode1 (unlabeled training data) and by up to 63% in mode2 (labeled training data), while improving the current active learner’s F1 by a relative margin of almost 100%.}, number={4}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Tu, Huy and Menzies, Tim}, year={2022}, month={Jul} } @article{chakraborty_majumder_tu_2022, title={Fair-SSL: Building fair ML Software with less data}, DOI={10.1145/3524491.3527305}, abstractNote={Ethical bias in machine learning models has become a matter of concern in the software engineering community. Most of the prior software engineering works concentrated on finding ethical bias in models rather than fixing it. After finding bias, the next step is mitigation. Prior researchers mainly tried to use supervised approaches to achieve fairness. However, in the real world, getting data with trustworthy ground truth is challenging, and ground truth can itself contain human bias. Semi-supervised learning is a technique where, incrementally, labeled data is used to generate pseudo-labels for the rest of the data (and then all that data is used for model training). In this work, we apply four popular semi-supervised techniques as pseudo-labelers to create fair classification models. Our framework, Fair-SSL, takes a very small amount (10%) of labeled data as input and generates pseudo-labels for the unlabeled data. We then synthetically generate new data points to balance the training data based on class and protected attribute, as proposed by Chakraborty et al. in FSE 2021. Finally, a classification model is trained on the balanced pseudo-labeled data and validated on test data. After experimenting on ten datasets and three learners, we find that Fair-SSL achieves performance similar to three state-of-the-art bias mitigation algorithms. That said, the clear advantage of Fair-SSL is that it requires only 10% of the labeled training data. To the best of our knowledge, this is the first SE work where semi-supervised techniques are used to fight against ethical bias in SE ML models. To facilitate open science and replication, all our source code and datasets are publicly available at https://github.com/joymallyac/FairSSL.}, journal={2022 IEEE/ACM INTERNATIONAL WORKSHOP ON EQUITABLE DATA & TECHNOLOGY (FAIRWARE 2022)}, author={Chakraborty, Joymallya and Majumder, Suvodeep and Tu, Huy}, year={2022}, pages={1–8} } @article{yu_fahid_tu_menzies_2022, title={Identifying Self-Admitted Technical Debts With Jitterbug: A Two-Step Approach}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.3031401}, DOI={10.1109/TSE.2020.3031401}, abstractNote={Keeping track of and managing Self-Admitted Technical Debts (SATDs) is important to maintaining a healthy software project. Identifying SATDs manually requires much time and effort from human experts. The current automated solutions do not have satisfactory precision and recall in identifying SATDs to fully automate the process. To solve the above problems, we propose a two-step framework called Jitterbug for identifying SATDs. 
Jitterbug first identifies the “easy to find” SATDs automatically with close to 100 percent precision using a novel pattern recognition technique. Subsequently, machine learning techniques are applied to assist human experts in manually identifying the remaining “hard to find” SATDs with reduced human effort. Our simulation studies on ten software projects show that Jitterbug can identify SATDs more efficiently (with less human effort) than the prior state-of-the-art methods.}, number={5}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Yu, Zhe and Fahid, Fahmid Morshed and Tu, Huy and Menzies, Tim}, year={2022}, month={May}, pages={1676–1691} } @article{tu_menzies_2021, title={FRUGAL: Unlocking Semi-Supervised Learning for Software Analytics}, DOI={10.1109/ASE51524.2021.9678617}, abstractNote={Standard software analytics often involves having a large amount of data with labels in order to commission models with acceptable performance. However, prior work has shown that such requirements can be expensive (taking several weeks to label thousands of commits), and such labels are not always available when traversing new research problems and domains. Unsupervised learning, which has only been studied extensively in defect prediction, is a promising direction for learning hidden patterns within unlabelled data. Nevertheless, unsupervised learning can be ineffective by itself and has not been explored in other domains (e.g., static analysis and issue close time). Motivated by this literature gap and these technical limitations, we present FRUGAL, a tuned semi-supervised method that builds on a simple optimization scheme and does not require sophisticated (e.g., deep learners) and expensive (e.g., 100% manually labelled data) methods. FRUGAL optimizes the unsupervised learner’s configurations (via a simple grid search) while validating our design decision of labelling just 2.5% of the data before prediction. As shown by the experiments of this paper, FRUGAL outperforms the state-of-the-art adoptable static code warning recognizer and issue close time predictor, while reducing the cost of labelling by a factor of 40 (from 100% to 2.5%). Hence we assert that FRUGAL can save considerable effort in data labelling, especially in validating prior work or researching new problems. Based on this work, we suggest that proponents of complex and expensive methods should always baseline such methods against simpler and cheaper alternatives. For instance, a semi-supervised learner like FRUGAL can serve as a baseline for state-of-the-art software analytics.}, journal={2021 36TH IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING ASE 2021}, author={Tu, Huy and Menzies, Tim}, year={2021}, pages={394–406} } @article{tu_papadimitriou_kiran_wang_mandal_deelman_menzies_2021, title={Mining Workflows for Anomalous Data Transfers}, ISSN={["2160-1852"]}, DOI={10.1109/MSR52588.2021.00013}, abstractNote={Modern scientific workflows are data-driven and are often executed on distributed, heterogeneous, high-performance computing infrastructures. Anomalies and failures in the workflow execution cause loss of scientific productivity and inefficient use of the infrastructure. Hence, detecting, diagnosing, and mitigating these anomalies are immensely important for reliable and performant scientific workflows. 
Since these workflows rely heavily on high-performance network transfers that require strict QoS constraints, accurately detecting anomalous network performance is crucial to ensure reliable and efficient workflow execution. To address this challenge, we have developed X-FLASH, a network anomaly detection tool for faulty TCP workflow transfers. X-FLASH incorporates novel hyperparameter tuning and data mining approaches for improving the performance of the machine learning algorithms so that they accurately classify the anomalous TCP packets. X-FLASH leverages XGBoost as an ensemble model and couples XGBoost with a sequential optimizer, FLASH, borrowed from search-based Software Engineering, to learn the optimal model parameters. X-FLASH found configurations that outperformed the existing approach by up to 28%, 29%, and 40% (relative improvement) for F-measure, G-score, and recall, in fewer than 30 evaluations. Given (1) this large improvement and (2) the simplicity of the tuning, we recommend that future research adopt such tuning studies as a new standard, at least in the area of scientific workflow anomaly detection.}, journal={2021 IEEE/ACM 18TH INTERNATIONAL CONFERENCE ON MINING SOFTWARE REPOSITORIES (MSR 2021)}, author={Tu, Huy and Papadimitriou, George and Kiran, Mariam and Wang, Cong and Mandal, Anirban and Deelman, Ewa and Menzies, Tim}, year={2021}, pages={1–12} } @article{tu_nair_2018, title={Is One Hyperparameter Optimizer Enough?}, DOI={10.1145/3278142.3278145}, abstractNote={Hyperparameter tuning is the black art of automatically finding a good combination of control parameters for a data miner. While widely applied in empirical Software Engineering, there has not been much discussion on which hyperparameter tuner is best for software analytics. To address this gap in the literature, this paper applied a range of hyperparameter optimizers (grid search, random search, differential evolution, and Bayesian optimization) to a defect prediction problem. Surprisingly, no hyperparameter optimizer was observed to be “best” and, for one of the two evaluation measures studied here (F-measure), hyperparameter optimization, in 50% of cases, was no better than using default configurations. We conclude that hyperparameter optimization is more nuanced than previously believed. While such optimization can certainly lead to large improvements in the performance of classifiers used in software analytics, it remains to be seen which specific optimizers should be applied to a new dataset.}, journal={PROCEEDINGS OF THE 4TH ACM SIGSOFT INTERNATIONAL WORKSHOP ON SOFTWARE ANALYTICS (SWAN'18)}, author={Tu, Huy and Nair, Vivek}, year={2018}, pages={19–25} }
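
Editor's note on the tuning theme running through the last two entries above: the following is a minimal, illustrative Python/scikit-learn sketch of the kind of comparison the 2018 abstract describes, running two of the four named optimizers (grid search and random search) over the same learner and checking whether either beats the other. It is not the authors' replication package; the synthetic data, the parameter grid, and the F1 scorer are assumptions made purely for demonstration.

# Illustrative sketch only (not the paper's artifact). Assumes scikit-learn is
# installed; the data, parameter grid, and scorer below are hypothetical.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Stand-in data: 500 rows of synthetic "code metrics" with a binary
# defective/clean label; a real study would load project data instead.
rng = np.random.default_rng(1)
X = rng.normal(size=(500, 20))
y = (X[:, 0] - X[:, 3] > 0).astype(int)

# A small, hypothetical search space for the learner being tuned.
param_grid = {"n_estimators": [50, 100, 200], "max_depth": [3, 5, None]}

# Tuner 1: exhaustive grid search over all 9 configurations.
grid = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid, scoring="f1", cv=5).fit(X, y)

# Tuner 2: random search sampling 5 of those configurations.
rand = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                          param_grid, n_iter=5, scoring="f1", cv=5,
                          random_state=0).fit(X, y)

# The 2018 abstract's point: neither tuner is uniformly "best", so compare
# both (and the default configuration) before committing to one.
print("grid search   best F1:", round(grid.best_score_, 3), grid.best_params_)
print("random search best F1:", round(rand.best_score_, 3), rand.best_params_)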