@article{majumder_chakraborty_menzies_2024,
  author       = {Majumder, Suvodeep and Chakraborty, Joymallya and Menzies, Tim},
  title        = {When less is more: on the value of ``co-training'' for semi-supervised software defect predictors},
  journal      = {Empirical Software Engineering},
  volume       = {29},
  number       = {2},
  year         = {2024},
  month        = mar,
  issn         = {1573-7616},
  doi          = {10.1007/s10664-023-10418-4},
}

@article{majumder_reich_2023,
  author       = {Majumder, Reetam and Reich, Brian J.},
  title        = {A deep learning synthetic likelihood approximation of a non-stationary spatial model for extreme streamflow forecasting},
  journal      = {Spatial Statistics},
  volume       = {55},
  year         = {2023},
  month        = jun,
  issn         = {2211-6753},
  doi          = {10.1016/j.spasta.2023.100755},
  abstractNote = {Extreme streamflow is a key indicator of flood risk, and quantifying the changes in its distribution under non-stationary climate conditions is key to mitigating the impact of flooding events. We propose a non-stationary process mixture model (NPMM) for annual streamflow maxima over the central US (CUS) which uses downscaled climate model precipitation projections to forecast extremal streamflow. Spatial dependence for the model is specified as a convex combination of transformed Gaussian and max-stable processes, indexed by a weight parameter which identifies the asymptotic regime of the process. The weight parameter is modeled as a function of the annual precipitation for each of the two hydrologic regions within the CUS, introducing spatio-temporal non-stationarity within the model. The NPMM is flexible with desirable tail dependence properties, but yields an intractable likelihood. To address this, we embed a neural network within a density regression model which is used to learn a synthetic likelihood function using simulations from the NPMM with different parameter settings. Our model is fitted using observational data for 1972--2021, and inference carried out in a Bayesian framework. The two regions within the CUS are estimated to be in different asymptotic regimes based on the posterior distribution of the weight parameter. Annual streamflow maxima estimates based on global climate models for two representative climate pathway scenarios suggest an overall increase in the frequency and magnitude of extreme streamflow for 2006--2035 compared to the historical period of 1972--2005.},
}

@article{majumder_chakraborty_bai_stolee_menzies_2023,
  author       = {Majumder, Suvodeep and Chakraborty, Joymallya and Bai, Gina R. and Stolee, Kathryn T. and Menzies, Tim},
  title        = {Fair Enough: Searching for Sufficient Measures of Fairness},
  journal      = {ACM Transactions on Software Engineering and Methodology},
  volume       = {32},
  number       = {6},
  year         = {2023},
  month        = nov,
  issn         = {1557-7392},
  url          = {https://doi.org/10.1145/3585006},
  doi          = {10.1145/3585006},
  abstractNote = {Testing machine learning software for ethical bias has become a pressing current concern. In response, recent research has proposed a plethora of new fairness metrics, for example, the dozens of fairness metrics in the IBM AIF360 toolkit. This raises the question: How can any fairness tool satisfy such a diverse range of goals? While we cannot completely simplify the task of fairness testing, we can certainly reduce the problem. This article shows that many of those fairness metrics effectively measure the same thing. Based on experiments using seven real-world datasets, we find that (a) 26 classification metrics can be clustered into seven groups and (b) four dataset metrics can be clustered into three groups. Further, each reduced set may actually predict different things. Hence, it is no longer necessary (or even possible) to satisfy all fairness metrics. In summary, to simplify the fairness testing problem, we recommend the following steps: (1) determine what type of fairness is desirable (and we offer a handful of such types), then (2) lookup those types in our clusters, and then (3) just test for one item per cluster. For the purpose of reproducibility, our scripts and data are available at https://github.com/Repoanonymous/Fairness_Metrics.},
}

@inproceedings{chakraborty_majumder_tu_2022,
  author       = {Chakraborty, Joymallya and Majumder, Suvodeep and Tu, Huy},
  title        = {{Fair-SSL}: Building fair {ML} Software with less data},
  booktitle    = {2022 IEEE/ACM International Workshop on Equitable Data \& Technology (FairWare 2022)},
  pages        = {1--8},
  year         = {2022},
  doi          = {10.1145/3524491.3527305},
  abstractNote = {Ethical bias in machine learning models has become a matter of concern in the software engineering community. Most of the prior software engineering works concentrated on finding ethical bias in models rather than fixing it. After finding bias, the next step is mitigation. Prior researchers mainly tried to use supervised approaches to achieve fairness. However, in the real world, getting data with trustworthy ground truth is challenging and also ground truth can contain human bias. Semi-supervised learning is a technique where, incrementally, labeled data is used to generate pseudo-labels for the rest of data (and then all that data is used for model training). In this work, we apply four popular semi-supervised techniques as pseudo-labelers to create fair classification models. Our framework, Fair-SSL, takes a very small amount (10\%) of labeled data as input and generates pseudo-labels for the unlabeled data. We then synthetically generate new data points to balance the training data based on class and protected attribute as proposed by Chakraborty et al. in FSE 2021. Finally, classification model is trained on the balanced pseudo-labeled data and validated on test data. After experimenting on ten datasets and three learners, we find that Fair-SSL achieves similar performance as three state-of-the-art bias mitigation algorithms. That said, the clear advantage of Fair-SSL is that it requires only 10\% of the labeled training data. To the best of our knowledge, this is the first SE work where semi-supervised techniques are used to fight against ethical bias in SE ML models. To facilitate open science and replication, all our source code and datasets are publicly available at https://github.com/joymallyac/FairSSL.},
}

@inproceedings{majumder_xia_krishna_menzies_2022,
  author       = {Majumder, Suvodeep and Xia, Tianpei and Krishna, Rahul and Menzies, Tim},
  title        = {Methods for Stabilizing Models Across Large Samples of Projects (with case studies on Predicting Defect and Project Health)},
  booktitle    = {2022 Mining Software Repositories Conference (MSR 2022)},
  pages        = {566--578},
  year         = {2022},
  issn         = {2160-1852},
  doi          = {10.1145/3524842.3527934},
  abstractNote = {Despite decades of research, Software Engineering (SE) lacks widely accepted models (that offer precise quantitative stable predictions) about what factors most influence software quality. This paper provides a promising result showing such stable models can be generated using a new transfer learning framework called “STABILIZER”. Given a tree of recursively clustered projects (using project meta-data), STABILIZER promotes a model upwards if it performs best in the lower clusters (stopping when the promoted model performs worse than the models seen at a lower level). The number of models found by STABILIZER is minimal: one for defect prediction (756 projects) and less than a dozen for project health (1628 projects). Hence, via STABILIZER, it is possible to find a few projects which can be used for transfer learning and make conclusions that hold across hundreds of projects at a time. Further, the models produced in this manner offer predictions that perform as well or better than the prior state-of-the-art. To the best of our knowledge, STABILIZER is order of magnitude faster than the prior state-of-the-art transfer learners which seek to find conclusion stability, and these case studies are the largest demonstration of the generalizability of quantitative predictions of project quality yet reported in the SE literature. In order to support open science, all our scripts and data are online at https://github.com/Anonymous633671/STABILIZER.},
}

@article{majumder_mody_menzies_2022,
  author       = {Majumder, Suvodeep and Mody, Pranav and Menzies, Tim},
  title        = {Revisiting process versus product metrics: a large scale analysis},
  journal      = {Empirical Software Engineering},
  volume       = {27},
  number       = {3},
  year         = {2022},
  month        = may,
  issn         = {1573-7616},
  doi          = {10.1007/s10664-021-10068-4},
  abstractNote = {Numerous methods can build predictive models from software data. However, what methods and conclusions should we endorse as we move from analytics in-the-small (dealing with a handful of projects) to analytics in-the-large (dealing with hundreds of projects)? To answer this question, we recheck prior small-scale results (about process versus product metrics for defect prediction and the granularity of metrics) using 722,471 commits from 700 Github projects. We find that some analytics in-the-small conclusions still hold when scaling up to analytics in-the-large. For example, like prior work, we see that process metrics are better predictors for defects than product metrics (best process/product-based learners respectively achieve recalls of 98\%/44\% and AUCs of 95\%/54\%, median values). That said, we warn that it is unwise to trust metric importance results from analytics in-the-small studies since those change dramatically when moving to analytics in-the-large. Also, when reasoning in-the-large about hundreds of projects, it is better to use predictions from multiple models (since single model predictions can become confused and exhibit a high variance).},
}

@inproceedings{chakraborty_majumder_menzies_2021,
  author       = {Chakraborty, Joymallya and Majumder, Suvodeep and Menzies, Tim},
  title        = {Bias in Machine Learning Software: Why? How? What to Do?},
  booktitle    = {Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE '21)},
  pages        = {429--440},
  year         = {2021},
  url          = {https://doi.org/10.1145/3468264.3468537},
  doi          = {10.1145/3468264.3468537},
  abstractNote = {Increasingly, software is making autonomous decisions in case of criminal sentencing, approving credit cards, hiring employees, and so on. Some of these decisions show bias and adversely affect certain social groups (e.g. those defined by sex, race, age, marital status). Many prior works on bias mitigation take the following form: change the data or learners in multiple ways, then see if any of that improves fairness. Perhaps a better approach is to postulate root causes of bias and then applying some resolution strategy. This paper postulates that the root causes of bias are the prior decisions that affect- (a) what data was selected and (b) the labels assigned to those examples. Our Fair-SMOTE algorithm removes biased labels; and rebalances internal distributions such that based on sensitive attribute, examples are equal in both positive and negative classes. On testing, it was seen that this method was just as effective at reducing bias as prior approaches. Further, models generated via Fair-SMOTE achieve higher performance (measured in terms of recall and F1) than other state-of-the-art fairness improvement algorithms. To the best of our knowledge, measured in terms of number of analyzed learners and datasets, this study is one of the largest studies on bias mitigation yet presented in the literature.},
}

@inproceedings{shrikanth_majumder_menzies_2021,
  author       = {Shrikanth, N. C. and Majumder, Suvodeep and Menzies, Tim},
  title        = {Early Life Cycle Software Defect Prediction. Why? How?},
  booktitle    = {2021 IEEE/ACM 43rd International Conference on Software Engineering (ICSE 2021)},
  pages        = {448--459},
  year         = {2021},
  issn         = {0270-5257},
  doi          = {10.1109/ICSE43902.2021.00050},
  abstractNote = {Many researchers assume that, for software analytics, "more data is better." We write to show that, at least for learning defect predictors, this may not be true. To demonstrate this, we analyzed hundreds of popular GitHub projects. These projects ran for 84 months and contained 3,728 commits (median values). Across these projects, most of the defects occur very early in their life cycle. Hence, defect predictors learned from the first 150 commits and four months perform just as well as anything else. This means that, at least for the projects studied here, after the first few months, we need not continually update our defect prediction models. We hope these results inspire other researchers to adopt a "simplicity-first" approach to their work. Some domains require a complex and data-hungry analysis. But before assuming complexity, it is prudent to check the raw data looking for "short cuts" that can simplify the analysis.},
}