NOTE(review): the three ESEC/FSE '21 entries below originally shared the duplicate
citation key "yang_menzies_2021"; they are disambiguated here as
yang_menzies_2021a / yang_menzies_2021b / yang_menzies_2021c.
Update any \cite{yang_menzies_2021} commands accordingly.
(Text outside @...{...} entries is ignored by BibTeX.)

@article{yedida_kang_tu_yang_lo_menzies_2023,
  author       = {Yedida, Rahul and Kang, Hong Jin and Tu, Huy and Yang, Xueqi and Lo, David and Menzies, Tim},
  title        = {How to Find Actionable Static Analysis Warnings: A Case Study With {FindBugs}},
  journal      = {IEEE Transactions on Software Engineering},
  volume       = {49},
  number       = {4},
  pages        = {2856--2872},
  year         = {2023},
  month        = apr,
  issn         = {1939-3520},
  doi          = {10.1109/TSE.2023.3234206},
  abstractNote = {Automatically generated static code warnings suffer from a large number of false alarms. Hence, developers only take action on a small percent of those warnings. To better predict which static code warnings should not be ignored, we suggest that analysts need to look deeper into their algorithms to find choices that better improve the particulars of their specific problem. Specifically, we show here that effective predictors of such warnings can be created by methods that locally adjust the decision boundary (between actionable warnings and others). These methods yield a new high water-mark for recognizing actionable static code warnings. For eight open-source Java projects (cassandra, jmeter, commons, lucene-solr, maven, ant, tomcat, derby) we achieve perfect test results on 4/8 datasets and, overall, a median AUC (area under the true negatives, true positives curve) of 92\%.},
}

@inproceedings{yang_menzies_2021a,
  author       = {Yang, Xueqi and Menzies, Tim},
  title        = {Documenting Evidence of a Replication of `Analyze This! 145 Questions for Data Scientists in Software Engineering'},
  booktitle    = {Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE '21)},
  pages        = {1602--1602},
  year         = {2021},
  doi          = {10.1145/3468264.3477219},
  abstractNote = {We report here the use of the 145 software engineering questions for data scientists presented in the Microsoft study in a recent FSE~'20 paper by Huijgens et al. The study by Begel et al. was replicated by Huijgens et al.},
}

@inproceedings{yang_menzies_2021b,
  author       = {Yang, Xueqi and Menzies, Tim},
  title        = {Documenting Evidence of a Replication of `Populating a Release History Database from Version Control and Bug Tracking Systems'},
  booktitle    = {Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE '21)},
  pages        = {1601--1601},
  year         = {2021},
  doi          = {10.1145/3468264.3477218},
  abstractNote = {We report here the use of a keyword-based and regular expression-based approach to identify bug-fixing commits by linking commit messages and issue tracker data in a recent FSE '20 paper by Penta et al. in their paper ``On the Relationship between Refactoring Actions and Bugs: A Differentiated Replication''. The approach replicated is a keyword-based and regular expression-based approach as studied by Fischer et al.},
}

@inproceedings{yang_menzies_2021c,
  author       = {Yang, Xueqi and Menzies, Tim},
  title        = {Documenting Evidence of a Reproduction of `Is There A ``Golden'' Feature Set for Static Warning Identification? -- An Experimental Evaluation'},
  booktitle    = {Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE '21)},
  pages        = {1603--1603},
  year         = {2021},
  doi          = {10.1145/3468264.3477220},
  abstractNote = {We report here the use of the static analysis dataset generated by FindBugs in a recent EMSE '21 paper by Yang et al. The artifact reproduced is supervised models to perform static analysis based on a golden feature set as studied by Wang et al.},
}

@article{yang_chen_yedida_yu_menzies_2021,
  author       = {Yang, Xueqi and Chen, Jianfeng and Yedida, Rahul and Yu, Zhe and Menzies, Tim},
  title        = {Learning to recognize actionable static code warnings (is intrinsically easy)},
  journal      = {Empirical Software Engineering},
  volume       = {26},
  number       = {3},
  year         = {2021},
  month        = may,
  publisher    = {Springer Science and Business Media LLC},
  issn         = {1573-7616},
  doi          = {10.1007/s10664-021-09948-6},
  abstractNote = {Static code warning tools often generate warnings that programmers ignore. Such tools can be made more useful via data mining algorithms that select the ``actionable'' warnings; i.e. the warnings that are usually not ignored. In this paper, we look for actionable warnings within a sample of 5,675 actionable warnings seen in 31,058 static code warnings from FindBugs. We find that data mining algorithms can find actionable warnings with remarkable ease. Specifically, a range of data mining methods (deep learners, random forests, decision tree learners, and support vector machines) all achieved very good results (recalls and AUC(TRN, TPR) measures usually over 95\% and false alarms usually under 5\%). Given that all these learners succeeded so easily, it is appropriate to ask if there is something about this task that is inherently easy. We report that while our data sets have up to 58 raw features, those features can be approximated by less than two underlying dimensions. For such intrinsically simple data, many different kinds of learners can generate useful models with similar performance. Based on the above, we conclude that learning to recognize actionable static code warnings is easy, using a wide range of learning algorithms, since the underlying data is intrinsically simple. If we had to pick one particular learner for this task, we would suggest linear SVMs (since, at least in our sample, that learner ran relatively quickly and achieved the best median performance) and we would not recommend deep learning (since this data is intrinsically very simple).},
}

@article{agrawal_yang_agrawal_yedida_shen_menzies_2021,
  author        = {Agrawal, Amritanshu and Yang, Xueqi and Agrawal, Rishabh and Yedida, Rahul and Shen, Xipeng and Menzies, Tim},
  title         = {Simpler Hyperparameter Optimization for Software Analytics: Why, How, When},
  journal       = {IEEE Transactions on Software Engineering},
  volume        = {48},
  number        = {8},
  pages         = {1--1},
  year          = {2021},
  publisher     = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn          = {0098-5589, 1939-3520, 2326-3881},
  doi           = {10.1109/TSE.2021.3073242},
  internal-note = {pages "1--1" look like an early-access placeholder; verify final pagination against the published issue},
  abstractNote  = {How can we make software analytics simpler and faster? One method is to match the complexity of analysis to the intrinsic complexity of the data being explored. For example, hyperparameter optimizers find the control settings for data miners that improve the predictions generated via software analytics. Sometimes, very fast hyperparameter optimization can be achieved by ``DODGE-ing''; i.e., simply steering way from settings that lead to similar conclusions. But when is it wise to use that simple approach and when must we use more complex (and much slower) optimizers? To answer this, we applied hyperparameter optimization to 120 SE data sets that explored bad smell detection, predicting Github issue close time, bug report analysis, defect prediction, and dozens of other non-SE problems. We find that the simple DODGE works best for data sets with low ``intrinsic dimensionality'' ($\mu_D \approx 3$) and very poorly for higher-dimensional data ($\mu_D > 8$). Nearly all the SE data seen here was intrinsically low-dimensional, indicating that DODGE is applicable for many SE analytics tasks.},
}

@article{yang_yu_wang_menzies_2021,
  author       = {Yang, Xueqi and Yu, Zhe and Wang, Junjie and Menzies, Tim},
  title        = {Understanding static code warnings: An incremental {AI} approach},
  journal      = {Expert Systems with Applications},
  volume       = {167},
  year         = {2021},
  month        = apr,
  issn         = {1873-6793},
  doi          = {10.1016/j.eswa.2020.114134},
  abstractNote = {Knowledge-based systems reason over some knowledge base. Hence, an important issue for such systems is how to acquire the knowledge needed for their inference. This paper assesses active learning methods for acquiring knowledge for ``static code warnings''. Static code analysis is a widely-used method for detecting bugs and security vulnerabilities in software systems. As software becomes more complex, analysis tools also report lists of increasingly complex warnings that developers need to address on a daily basis. Such static code analysis tools are usually over-cautious; i.e. they often offer many warnings about spurious issues. Previous research work shows that about 35\% to 91\% warnings reported as bugs by SA tools are actually unactionable (i.e., warnings that would not be acted on by developers because they are falsely suggested as bugs). Experienced developers know which errors are important and which can be safely ignored. How can we capture that experience? This paper reports on an incremental AI tool that watches humans reading false alarm reports. Using an incremental support vector machine mechanism, this AI tool can quickly learn to distinguish spurious false alarms from more serious matters that deserve further attention. In this work, nine open-source projects are employed to evaluate our proposed model on the features extracted by previous researchers and identify the actionable warnings in a priority order given by our algorithm. We observe that our model can identify over 90\% of actionable warnings when our methods tell humans to ignore 70 to 80\% of the warnings.},
}