% Bibliography entries, one field per line.
% Conventions used below: page ranges use "--", months use the predefined
% three-letter macros (unbraced), acronyms in titles and venue names are
% brace-protected, and conference/workshop papers carry the proceedings
% title in booktitle. The abstractNote field is not a standard field and is
% ignored by standard styles; it is kept for tools that read it.

% Journal article.
@article{mathew_agrawal_menzies_2023,
  author       = {Mathew, George and Agrawal, Amritanshu and Menzies, Tim},
  title        = {Finding Trends in Software Research},
  journal      = {{IEEE} Transactions on Software Engineering},
  volume       = {49},
  number       = {4},
  pages        = {1397--1410},
  year         = {2023},
  month        = apr,
  issn         = {1939-3520},
  doi          = {10.1109/TSE.2018.2870388},
  url          = {https://doi.org/10.1109/TSE.2018.2870388},
  abstractNote = {Text mining methods can find large scale trends within research communities. For example, using stable Latent Dirichlet Allocation (a topic modeling algorithm) this study found 10 major topics in 35,391 SE research papers from 34 leading SE venues over the last 25 years (divided, evenly, between conferences and journals). Out study also shows how those topics have changed over recent years. Also, we note that (in the historical record) mono-focusing on a single topic can lead to fewer citations than otherwise. Further, while we find no overall gender bias in SE authorship, we note that women are under-represented in the top-most cited papers in our field. Lastly, we show a previously unreported dichotomy between software conferences and journals (so research topics that succeed at conferences might not succeed at journals, and vice versa). An important aspect of this work is that it is automatic and quickly repeatable (unlike prior SE bibliometric studies that used tediously slow and labor intensive methods). Automation is important since, like any data mining study, its conclusions are skewed by the data used in the analysis. The automatic methods of this paper make it far easier for other researchers to re-apply the analysis to new data, or if they want to use different modeling assumptions.},
}

% Journal article.
% NOTE(review): pages 1--1 looks like an early-access placeholder; confirm the
% final page range (and volume/number/year) against the publisher record.
@article{agrawal_yang_agrawal_yedida_shen_menzies_2021,
  author       = {Agrawal, Amritanshu and Yang, Xueqi and Agrawal, Rishabh and Yedida, Rahul and Shen, Xipeng and Menzies, Tim},
  title        = {Simpler Hyperparameter Optimization for Software Analytics: Why, How, When},
  journal      = {{IEEE} Transactions on Software Engineering},
  volume       = {48},
  number       = {8},
  pages        = {1--1},
  year         = {2021},
  publisher    = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn         = {0098-5589 1939-3520 2326-3881},
  doi          = {10.1109/TSE.2021.3073242},
  url          = {http://dx.doi.org/10.1109/TSE.2021.3073242},
  abstractNote = {How can we make software analytics simpler and faster? One method is to match the complexity of analysis to the intrinsic complexity of the data being explored. For example, hyperparameter optimizers find the control settings for data miners that improve the predictions generated via software analytics. Sometimes, very fast hyperparameter optimization can be achieved by “DODGE-ing”; i.e., simply steering way from settings that lead to similar conclusions. But when is it wise to use that simple approach and when must we use more complex (and much slower) optimizers? To answer this, we applied hyperparameter optimization to 120 SE data sets that explored bad smell detection, predicting Github issue close time, bug report analysis, defect prediction, and dozens of other non-SE problems. We find that the simple DODGE works best for data sets with low “intrinsic dimensionality” ($\mu_D \approx 3$) and very poorly for higher-dimensional data ($\mu_D > 8$). Nearly all the SE data seen here was intrinsically low-dimensional, indicating that DODGE is applicable for many SE analytics tasks.},
}

% Journal article.
@article{agrawal_menzies_minku_wagner_yu_2020,
  author       = {Agrawal, Amritanshu and Menzies, Tim and Minku, Leandro L. and Wagner, Markus and Yu, Zhe},
  title        = {Better software analytics via ``{DUO}'': Data mining algorithms using/used-by optimizers},
  journal      = {Empirical Software Engineering},
  volume       = {25},
  number       = {3},
  pages        = {2099--2136},
  year         = {2020},
  month        = may,
  publisher    = {Springer Science and Business Media LLC},
  issn         = {1573-7616},
  doi          = {10.1007/s10664-020-09808-9},
  url          = {https://doi.org/10.1007/s10664-020-09808-9},
  abstractNote = {This paper claims that a new field of empirical software engineering research and practice is emerging: data mining using/used-by optimizers for empirical studies, or DUO. For example, data miners can generate models that are explored by optimizers. Also, optimizers can advise how to best adjust the control parameters of a data miner. This combined approach acts like an agent leaning over the shoulder of an analyst that advises "ask this question next" or "ignore that problem, it is not relevant to your goals". Further, those agents can help us build "better" predictive models, where "better" can be either greater predictive accuracy or faster modeling time (which, in turn, enables the exploration of a wider range of options). We also caution that the era of papers that just use data miners is coming to an end. Results obtained from an unoptimized data miner can be quickly refuted, just by applying an optimizer to produce a different (and better performing) model. Our conclusion, hence, is that for software analytics it is possible, useful and necessary to combine data mining and optimization using DUO.},
}

% Workshop paper; the proceedings title is carried in booktitle.
@inproceedings{rahman_agrawal_krishna_sobran_2018,
  author       = {Rahman, Akond and Agrawal, Amritanshu and Krishna, Rahul and Sobran, Alexander},
  title        = {Characterizing the Influence of Continuous Integration},
  booktitle    = {Proceedings of the 4th {ACM} {SIGSOFT} International Workshop on Software Analytics ({SWAN}'18)},
  pages        = {8--14},
  year         = {2018},
  doi          = {10.1145/3278142.3278149},
  abstractNote = {Continuous integration (CI) tools integrate code changes by automatically compiling, building, and executing test cases upon submission of code changes. Use of CI tools is getting increasingly popular, yet how proprietary projects reap the benefits of CI remains unknown. To investigate the influence of CI on software development, we analyze 150 open source software (OSS) projects, and 123 proprietary projects. For OSS projects, we observe the expected benefits after CI adoption, e.g., improvements in bug and issue resolution. However, for the proprietary projects, we cannot make similar observations. Our findings indicate that only adoption of CI might not be enough to the improve software development process. CI can be effective for software development if practitioners use CI's feedback mechanism efficiently, by applying the practice of making frequent commits. For our set of proprietary projects we observe practitioners commit less frequently, and hence not use CI effectively for obtaining feedback on the submitted code changes. Based on our findings we recommend industry practitioners to adopt the best practices of CI to reap the benefits of CI tools for example, making frequent commits.},
}

% Conference paper; the proceedings title is carried in booktitle.
@inproceedings{nair_agrawal_chen_fu_mathew_menzies_minku_wagner_yu_2018,
  author       = {Nair, Vivek and Agrawal, Amritanshu and Chen, Jianfeng and Fu, Wei and Mathew, George and Menzies, Tim and Minku, Leandro and Wagner, Markus and Yu, Zhe},
  title        = {Data-Driven Search-based Software Engineering},
  booktitle    = {2018 {IEEE/ACM} 15th International Conference on Mining Software Repositories ({MSR})},
  pages        = {341--352},
  year         = {2018},
  issn         = {2160-1852},
  doi          = {10.1145/3196398.3196442},
  abstractNote = {This paper introduces Data-Driven Search-based Software Engineering (DSE), which combines insights from Mining Software Repositories (MSR) and Search-based Software Engineering (SBSE). While MSR formulates software engineering problems as data mining problems, SBSE reformulates Software Engineering (SE) problems as optimization problems and use meta-heuristic algorithms to solve them. Both MSR and SBSE share the common goal of providing insights to improve software engineering. The algorithms used in these two areas also have intrinsic relationships. We, therefore, argue that combining these two fields is useful for situations (a)~which require learning from a large data source or (b)~when optimizers need to know the lay of the land to find better solutions, faster. This paper aims to answer the following three questions: (1) What are the various topics addressed by DSE?, (2) What types of data are used by the researchers in this area?, and (3) What research approaches do researchers use? The paper briefly sets out to act as a practical guide to develop new DSE techniques and also to serve as a teaching resource. This paper also presents a resource (tiny.cc/data-se) for exploring DSE. The resource contains 89 artifacts which are related to DSE, divided into 13 groups such as requirements engineering, software product lines, software processes. All the materials in this repository have been used in recent software engineering papers; i.e., for all this material, there exist baseline results against which researchers can comparatively assess their new ideas.},
}

% Conference paper; the proceedings title is carried in booktitle.
@inproceedings{agrawal_menzies_2018,
  author       = {Agrawal, Amritanshu and Menzies, Tim},
  title        = {Is ``Better Data'' Better Than ``Better Data Miners''? On the Benefits of Tuning {SMOTE} for Defect Prediction},
  booktitle    = {Proceedings 2018 {IEEE/ACM} 40th International Conference on Software Engineering ({ICSE})},
  pages        = {1050--1061},
  year         = {2018},
  doi          = {10.1145/3180155.3180197},
  abstractNote = {We report and fix an important systematic error in prior studies that ranked classifiers for software analytics. Those studies did not (a) assess classifiers on multiple criteria and they did not (b) study how variations in the data affect the results. Hence, this paper applies (a) multi-performance criteria while (b) fixing the weaker regions of the training data (using SMOTUNED, which is an auto-tuning version of SMOTE). This approach leads to dramatically large increases in software defect predictions when applied in a 5*5 cross-validation study for 3,681 JAVA classes (containing over a million lines of code) from open source systems, SMOTUNED increased AUC and recall by 60% and 20% respectively. These improvements are independent of the classifier used to predict for defects. Same kind of pattern (improvement) was observed when a comparative analysis of SMOTE and SMOTUNED was done against the most recent class imbalance technique. In conclusion, for software analytic tasks like defect prediction, (1) data pre-processing can be more important than classifier choice, (2) ranking studies are incomplete without such pre-processing, and (3) SMOTUNED is a promising candidate for pre-processing.},
}

% Journal article.
@article{agrawal_fu_menzies_2018,
  author       = {Agrawal, Amritanshu and Fu, Wei and Menzies, Tim},
  title        = {What is wrong with topic modeling? And how to fix it using search-based software engineering},
  journal      = {Information and Software Technology},
  volume       = {98},
  pages        = {74--88},
  year         = {2018},
  month        = jun,
  publisher    = {Elsevier BV},
  issn         = {1873-6025},
  doi          = {10.1016/j.infsof.2018.02.005},
  url          = {https://doi.org/10.1016/j.infsof.2018.02.005},
  abstractNote = {Topic modeling finds human-readable structures in unstructured textual data. A widely used topic modeling technique is Latent Dirichlet allocation. When running on different datasets, LDA suffers from “order effects”, i.e., different topics are generated if the order of training data is shuffled. Such order effects introduce a systematic error for any study. This error can relate to misleading results; specifically, inaccurate topic descriptions and a reduction in the efficacy of text mining classification results. To provide a method in which distributions generated by LDA are more stable and can be used for further analysis. We use LDADE, a search-based software engineering tool which uses Differential Evolution (DE) to tune the LDA’s parameters. LDADE is evaluated on data from a programmer information exchange site (Stackoverflow), title and abstract text of thousands of Software Engineering (SE) papers, and software defect reports from NASA. Results were collected across different implementations of LDA (Python+Scikit-Learn, Scala+Spark) across Linux platform and for different kinds of LDAs (VEM, Gibbs sampling). Results were scored via topic stability and text mining classification accuracy. In all treatments: (i) standard LDA exhibits very large topic instability; (ii) LDADE’s tunings dramatically reduce cluster instability; (iii) LDADE also leads to improved performances for supervised as well as unsupervised learning. Due to topic instability, using standard LDA with its “off-the-shelf” settings should now be depreciated. Also, in future, we should require SE papers that use LDA to test and (if needed) mitigate LDA topic instability. Finally, LDADE is a candidate technology for effectively and efficiently reducing that instability.},
}