@article{majumder_chakraborty_menzies_2024, title={When less is more: on the value of "co-training" for semi-supervised software defect predictors}, volume={29}, ISSN={1573-7616}, DOI={10.1007/s10664-023-10418-4}, number={2}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Majumder, Suvodeep and Chakraborty, Joymallya and Menzies, Tim}, year={2024}, month={Mar} }

@article{majumder_chakraborty_bai_stolee_menzies_2023, title={Fair Enough: Searching for Sufficient Measures of Fairness}, volume={32}, ISSN={1557-7392}, url={https://doi.org/10.1145/3585006}, DOI={10.1145/3585006}, abstractNote={Testing machine learning software for ethical bias has become a pressing current concern. In response, recent research has proposed a plethora of new fairness metrics, for example, the dozens of fairness metrics in the IBM AIF360 toolkit. This raises the question: How can any fairness tool satisfy such a diverse range of goals? While we cannot completely simplify the task of fairness testing, we can certainly reduce the problem. This article shows that many of those fairness metrics effectively measure the same thing. Based on experiments using seven real-world datasets, we find that (a) 26 classification metrics can be clustered into seven groups and (b) four dataset metrics can be clustered into three groups. Further, each reduced set may actually predict different things. Hence, it is no longer necessary (or even possible) to satisfy all fairness metrics. In summary, to simplify the fairness testing problem, we recommend the following steps: (1) determine what type of fairness is desirable (and we offer a handful of such types), then (2) look up those types in our clusters, and then (3) just test for one item per cluster.}, number={6}, journal={ACM TRANSACTIONS ON SOFTWARE ENGINEERING AND METHODOLOGY}, author={Majumder, Suvodeep and Chakraborty, Joymallya and Bai, Gina R. and Stolee, Kathryn T. and Menzies, Tim}, year={2023}, month={Nov} }

@article{peng_chakraborty_menzies_2023, title={FairMask: Better Fairness via Model-Based Rebalancing of Protected Attributes}, volume={49}, ISSN={1939-3520}, url={https://doi.org/10.1109/TSE.2022.3220713}, DOI={10.1109/TSE.2022.3220713}, abstractNote={Context: Machine learning software can generate models that inappropriately discriminate against specific protected social groups (e.g., groups based on gender, ethnicity, etc.). Motivated by those results, software engineering researchers have proposed many methods for mitigating those discriminatory effects. While those methods are effective in mitigating bias, few of them can explain the root cause of that bias. Objective: We aim to better detect and mitigate algorithmic discrimination in machine learning software problems. Method: Here we propose FairMask, a model-based extrapolation method that is capable of both mitigating bias and explaining the cause. In our FairMask approach, protected attributes are represented by models learned from the other independent variables (and these models offer extrapolations over the space between existing examples). We then use the extrapolation models to relabel protected attributes later seen in testing data or deployment time. Our approach aims to offset the biased predictions of the classification model by rebalancing the distribution of protected attributes.
Results: The experiments of this paper show that, without compromising (original) model performance, FairMask can achieve significantly better group and individual fairness (as measured in different metrics) than benchmark methods. Moreover, compared to another instance-based rebalancing method, our model-based approach shows faster runtime and thus better scalability. Conclusion: Algorithmic decision bias can be removed via extrapolation that corrects the misleading latent correlation between the protected attributes and other non-protected ones. As evidence for this, our proposed FairMask performs better (measured by fairness and performance metrics) than two state-of-the-art fairness algorithms. Reproduction Package: In order to better support open science, all scripts and data used in this study are available online at https://github.com/anonymous12138/biasmitigation.}, number={4}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, author={Peng, Kewen and Chakraborty, Joymallya and Menzies, Tim}, year={2023}, month={Apr}, pages={2426–2439} }

@article{chakraborty_majumder_tu_2022, title={Fair-SSL: Building fair ML Software with less data}, DOI={10.1145/3524491.3527305}, abstractNote={Ethical bias in machine learning models has become a matter of concern in the software engineering community. Most of the prior software engineering works concentrated on finding ethical bias in models rather than fixing it. After finding bias, the next step is mitigation. Prior researchers mainly tried to use supervised approaches to achieve fairness. However, in the real world, getting data with trustworthy ground truth is challenging, and the ground truth itself can contain human bias. Semi-supervised learning is a technique where, incrementally, labeled data is used to generate pseudo-labels for the rest of the data (and then all that data is used for model training). In this work, we apply four popular semi-supervised techniques as pseudo-labelers to create fair classification models. Our framework, Fair-SSL, takes a very small amount (10%) of labeled data as input and generates pseudo-labels for the unlabeled data. We then synthetically generate new data points to balance the training data based on class and protected attribute, as proposed by Chakraborty et al. in FSE 2021. Finally, a classification model is trained on the balanced pseudo-labeled data and validated on test data. After experimenting on ten datasets and three learners, we find that Fair-SSL achieves performance similar to three state-of-the-art bias mitigation algorithms. That said, the clear advantage of Fair-SSL is that it requires only 10% of the labeled training data. To the best of our knowledge, this is the first SE work where semi-supervised techniques are used to fight against ethical bias in SE ML models. To facilitate open science and replication, all our source code and datasets are publicly available at https://github.com/joymallyac/FairSSL.},
journal={2022 IEEE/ACM INTERNATIONAL WORKSHOP ON EQUITABLE DATA & TECHNOLOGY (FAIRWARE 2022)}, author={Chakraborty, Joymallya and Majumder, Suvodeep and Tu, Huy}, year={2022}, pages={1–8} }

@article{chakraborty_majumder_menzies_2021, title={Bias in Machine Learning Software: Why? How? What to Do?}, url={https://doi.org/10.1145/3468264.3468537}, DOI={10.1145/3468264.3468537}, abstractNote={Increasingly, software is making autonomous decisions in cases such as criminal sentencing, approving credit cards, hiring employees, and so on. Some of these decisions show bias and adversely affect certain social groups (e.g. those defined by sex, race, age, marital status). Many prior works on bias mitigation take the following form: change the data or learners in multiple ways, then see if any of that improves fairness. Perhaps a better approach is to postulate root causes of bias and then apply some resolution strategy. This paper postulates that the root causes of bias are the prior decisions that affect (a) what data was selected and (b) the labels assigned to those examples. Our Fair-SMOTE algorithm removes biased labels and rebalances internal distributions such that, based on the sensitive attribute, examples are equal in both positive and negative classes. On testing, it was seen that this method was just as effective at reducing bias as prior approaches. Further, models generated via Fair-SMOTE achieve higher performance (measured in terms of recall and F1) than other state-of-the-art fairness improvement algorithms. To the best of our knowledge, measured in terms of the number of analyzed learners and datasets, this study is one of the largest studies on bias mitigation yet presented in the literature.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Chakraborty, Joymallya and Majumder, Suvodeep and Menzies, Tim}, year={2021}, pages={429–440} }

@article{chakraborty_peng_menzies_2020, title={Making Fair ML Software using Trustworthy Explanation}, ISSN={1527-1366}, DOI={10.1145/3324884.3418932}, abstractNote={Machine learning software is being used in many applications (finance, hiring, admissions, criminal justice) having huge social impact. But sometimes the behavior of this software is biased and it shows discrimination based on some sensitive attributes such as sex, race, etc. Prior works concentrated on finding and mitigating bias in ML models. A recent trend is using instance-based, model-agnostic explanation methods such as LIME to find out bias in the model prediction. Our work concentrates on finding shortcomings of current bias measures and explanation methods. We show how our proposed method based on K nearest neighbors can overcome those shortcomings and find the underlying bias of black box models. Our results are more trustworthy and helpful for practitioners.
Finally, we describe our future framework combining explanation and planning to build fair software.}, journal={2020 35TH IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING (ASE 2020)}, author={Chakraborty, Joymallya and Peng, Kewen and Menzies, Tim}, year={2020}, pages={1229–1233} }

@article{imtiaz_middleton_chakraborty_robson_bai_murphy-hill_2019, title={Investigating the Effects of Gender Bias on GitHub}, ISSN={0270-5257}, DOI={10.1109/ICSE.2019.00079}, abstractNote={Diversity, including gender diversity, is valued by many software development organizations, yet the field remains dominated by men. One reason for this lack of diversity is gender bias. In this paper, we study the effects of that bias by using an existing framework derived from the gender studies literature. We adapt the four main effects proposed in the framework by posing hypotheses about how they might manifest on GitHub, then evaluate those hypotheses quantitatively. While our results show that the effects of gender bias are largely invisible on the GitHub platform itself, there are still signals of women concentrating their work in fewer places and being more restrained in communication than men.}, journal={2019 IEEE/ACM 41ST INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2019)}, author={Imtiaz, Nasif and Middleton, Justin and Chakraborty, Joymallya and Robson, Neill and Bai, Gina and Murphy-Hill, Emerson}, year={2019}, pages={700–711} }

@article{chen_chakraborty_clark_haverlock_cherian_menzies_2019, title={Predicting Breakdowns in Cloud Services (with SPIKE)}, DOI={10.1145/3338906.3340450}, abstractNote={Maintaining web services is a mission-critical task where any downtime means loss of revenue and reputation (of being a reliable service provider). In the current competitive web services market, such a loss of reputation causes extensive loss of future revenue. To address this issue, we developed SPIKE, a data mining tool which can predict upcoming service breakdowns half an hour into the future. Such predictions let an organization alert and assemble the tiger team to address the problem (e.g. by reconfiguring cloud hardware in order to reduce the likelihood of that breakdown). SPIKE utilizes (a) regression tree learning (with CART); (b) synthetic minority over-sampling (to handle how rare spikes are in our data); (c) hyperparameter optimization (to learn the best settings for our local data); and (d) a technique we called “topology sampling”, where training vectors are built from extensive details of an individual node plus summary details on all of its neighbors. In the experiments reported here, SPIKE predicted service spikes 30 minutes into the future with recall and precision of 75% and above. Also, SPIKE performed relatively better than other widely used learning methods (neural nets, random forests, logistic regression).}, journal={ESEC/FSE'2019: PROCEEDINGS OF THE 2019 27TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Chen, Jianfeng and Chakraborty, Joymallya and Clark, Philip and Haverlock, Kevin and Cherian, Snehit and Menzies, Tim}, year={2019}, pages={916–924} }