@article{johnson_menzies_2024a, title={Ethics: Why Software Engineers Can't Afford to Look Away}, volume={41}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2023.3319768}, DOI={10.1109/MS.2023.3319768}, abstractNote={Some people shy away from discussing ethics, believing it's not in the domain of software engineering. We want to steer the conversation in the opposite direction, and this column explains that such ethics-based discussions are crucial to our profession.}, number={1}, journal={IEEE SOFTWARE}, author={Johnson, Brittany and Menzies, Tim}, year={2024}, month={Jan}, pages={142–144} } @article{johnson_menzies_2024b, title={Fighting for What's Right: An Interview With Marc Canellas}, volume={41}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2023.3340928}, DOI={10.1109/MS.2023.3340928}, number={2}, journal={IEEE SOFTWARE}, author={Johnson, Brittany and Menzies, Tim}, year={2024}, pages={104–107} } @article{lustosa_menzies_2024, title={Learning from Very Little Data: On the Value of Landscape Analysis for Predicting Software Project Health}, url={https://doi.org/10.1145/3630252}, DOI={10.1145/3630252}, abstractNote={When data is scarce, software analytics can make many mistakes. For example, consider learning predictors for open source project health (e.g., the number of closed pull requests in 12 months time). The training data for this task may be very small (e.g., 5 years of data, collected every month means just 60 rows of training data). The models generated from such tiny datasets can make many prediction errors.}, journal={ACM Transactions on Software Engineering and Methodology}, author={Lustosa, Andre and Menzies, Tim}, year={2024}, month={Mar} } @article{johnson_menzies_2024c, title={The Power of Positionality-Why Accessibility? An Interview With Kevin Moran and Arun Krishnavajjala}, volume={41}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2024.3360650}, DOI={10.1109/MS.2024.3360650}, abstractNote={WHAT IF WE could create something where developers can make their applications accessible from the start, where they can still innovate and make cool apps but also make them accessible at the same time? This is the shared vision of third-year Ph.D. student Arun Krishnavajjala and his advisor Dr.}, number={3}, journal={IEEE SOFTWARE}, author={Johnson, Brittany and Menzies, Tim}, year={2024}, pages={91–94} } @article{ling_menzies_hazard_shu_beel_2024, title={Trading Off Scalability, Privacy, and Performance in Data Synthesis}, volume={12}, ISSN={["2169-3536"]}, url={https://doi.org/10.1109/ACCESS.2024.3366556}, DOI={10.1109/ACCESS.2024.3366556}, abstractNote={Synthetic data has been widely applied in the real world recently. One typical example is the creation of synthetic data for privacy-sensitive datasets. In this scenario, synthetic data substitutes the real data, which contains private information, and is used for public testing of machine learning models. Another typical example is imbalanced-data over-sampling, in which synthetic data is generated in the region of minority samples to balance the positive and negative ratio when training machine learning models. In this study, we concentrate on the first example, and introduce (a) the Howso engine, and (b) our proposed random projection based synthetic data generation framework. We evaluate these two algorithms on the aspects of privacy preservation and accuracy, and compare them to the two state-of-the-art synthetic data generation algorithms DataSynthesizer and Synthetic Data Vault.
We show that the synthetic data generated by Howso engine has good privacy and accuracy, which results in the best overall score. On the other hand, our proposed random projection based framework can generate synthetic data with highest accuracy score, and has the fastest scalability.}, journal={IEEE ACCESS}, author={Ling, Xiao and Menzies, Tim and Hazard, Christopher and Shu, Jack and Beel, Jacob}, year={2024}, pages={26642–26654} } @article{majumder_chakraborty_menzies_2024, title={When less is more: on the value of "co-training" for semi-supervised software defect predictors}, volume={29}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-023-10418-4}, number={2}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Majumder, Suvodeep and Chakraborty, Joymallya and Menzies, Tim}, year={2024}, month={Mar} } @article{menzies_hazard_2023, title={"The Best Data Are Fake Data?": An Interview With Chris Hazard}, volume={40}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2023.3286480}, DOI={10.1109/MS.2023.3286480}, abstractNote={In this issue, we interview Dr. Chris Hazard, cofounder of Diveplane, which is a leader in the burgeoning international synthetic data market. Dr. Hazard discusses the ethical implications of using synthetic data generated from real information sources.}, number={5}, journal={IEEE SOFTWARE}, author={Menzies, Tim and Hazard, Chris}, year={2023}, month={Sep}, pages={121–124} } @article{baldassarre_ernst_hermann_menzies_yedida_2023, title={(Re)Use of Research Results (Is Rampant)}, volume={66}, ISSN={["1557-7317"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85147250416&partnerID=MN8TOARS}, DOI={10.1145/3554976}, abstractNote={Prior pessimism about reuse in software engineering research may have been a result of using the wrong methods to measure the wrong things.}, number={2}, journal={COMMUNICATIONS OF THE ACM}, author={Baldassarre, Maria Teresa and Ernst, Neil and Hermann, Ben and Menzies, Tim and Yedida, Rahul}, year={2023}, month={Feb}, pages={75–81} } @article{zhang_sun_xu_sui_bandara_chen_menzies_2023, title={A Tale of Two Cities: Data and Configuration Variances in Robust Deep Learning}, volume={27}, ISSN={["1941-0131"]}, url={https://doi.org/10.1109/MIC.2023.3322283}, DOI={10.1109/MIC.2023.3322283}, abstractNote={Deep neural networks (DNNs) have widespread applications in industries such as image recognition, supply chain, medical diagnosis, and autonomous driving. However, previous work has shown that the high accuracy of a DNN model does not imply high robustness (i.e., consistent performances on new and future datasets) because the input data and external environment (e.g., software and model configurations) for a deployed model are constantly changing. Therefore, ensuring robustness is crucial to enhance business and consumer confidence. Previous research focuses mostly on the data aspect of model variance. This article takes a holistic view of DNN robustness by summarizing the issues related to both data and software configuration variances. We also present a predictive framework using search-based optimization to generate representative variances for robust learning, considering data and configurations.}, number={6}, journal={IEEE INTERNET COMPUTING}, author={Zhang, Guanqin and Sun, Jiankun and Xu, Feng and Sui, Yulei and Bandara, H. M. N. 
Dilum and Chen, Shiping and Menzies, Tim}, year={2023}, month={Nov}, pages={13–20} } @article{yedida_krishna_kalia_menzies_xiao_vukovic_2023, title={An expert system for redesigning software for cloud applications}, volume={219}, ISSN={["1873-6793"]}, url={https://doi.org/10.1016/j.eswa.2023.119673}, DOI={10.1016/j.eswa.2023.119673}, abstractNote={Cloud-based software has many advantages. When services are divided into many independent components, they are easier to update. Also, during peak demand, it is easier to scale cloud services (just hire more CPUs). Hence, many organizations are partitioning their monolithic enterprise applications into cloud-based microservices. Recently there has been much work using machine learning to simplify this partitioning task. Despite much research, no single partitioning method can be recommended as generally useful. More specifically, those prior solutions are “brittle”; i.e. if they work well for one kind of goal in one dataset, then they can be sub-optimal if applied to many datasets and multiple goals. This work extends prior work and proposes DEEPLY to fix the brittleness problem. Specifically, we use (a) hyper-parameter optimization to sample from the Pareto frontier of configurations (b) a weighted loss to choose optimally from this Pareto frontier (c) the 1cycle learning rate policy to avoid local minima with Adam and (d) spectral clustering over k-means. Our work shows that DEEPLY outperforms other algorithms in this space across different metrics. Moreover, our ablation study reveals that of the changes, the weighted loss is the most important, followed by hyper-parameter optimization (contrary to prior belief). To enable the reuse of this research, DEEPLY is available on-line at .}, journal={EXPERT SYSTEMS WITH APPLICATIONS}, author={Yedida, Rahul and Krishna, Rahul and Kalia, Anup and Menzies, Tim and Xiao, Jin and Vukovic, Maja}, year={2023}, month={Jun} } @article{shrikanth_menzies_2023, title={Assessing the Early Bird Heuristic (for Predicting ProjectQuality)}, volume={32}, ISSN={["1557-7392"]}, url={https://doi.org/10.1145/3583565}, DOI={10.1145/3583565}, abstractNote={Before researchers rush to reason across all available data or try complex methods, perhaps it is prudent to first check for simpler alternatives. Specifically, if the historical data has the most information in some small region, then perhaps a model learned from that region would suffice for the rest of the project.}, number={5}, journal={ACM TRANSACTIONS ON SOFTWARE ENGINEERING AND METHODOLOGY}, author={Shrikanth, N. C. and Menzies, Tim}, year={2023}, month={Jul} } @article{alvarez_menzies_2023, title={Don't Lie to Me: Avoiding Malicious Explanations With STEALTH}, volume={40}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2023.3244713}, DOI={10.1109/MS.2023.3244713}, abstractNote={STEALTH is a method for using some artificial intelligence-generated models without suffering from malicious attacks or associated unfairness issues. 
STEALTH asks so few queries (one per data cluster) that malicious algorithms cannot detect its operation or know when to lie.}, number={3}, journal={IEEE SOFTWARE}, author={Alvarez, Lauren and Menzies, Tim}, year={2023}, pages={43–53} } @article{majumder_chakraborty_bai_stolee_menzies_2023, title={Fair Enough: Searching for Sufficient Measures of Fairness}, volume={32}, ISSN={["1557-7392"]}, url={https://doi.org/10.1145/3585006}, DOI={10.1145/3585006}, abstractNote={Testing machine learning software for ethical bias has become a pressing current concern. In response, recent research has proposed a plethora of new fairness metrics, for example, the dozens of fairness metrics in the IBM AIF360 toolkit. This raises the question: How can any fairness tool satisfy such a diverse range of goals? While we cannot completely simplify the task of fairness testing, we can certainly reduce the problem. This article shows that many of those fairness metrics effectively measure the same thing. Based on experiments using seven real-world datasets, we find that (a) 26 classification metrics can be clustered into seven groups and (b) four dataset metrics can be clustered into three groups. Further, each reduced set may actually predict different things. Hence, it is no longer necessary (or even possible) to satisfy all fairness metrics. In summary, to simplify the fairness testing problem, we recommend the following steps: (1) determine what type of fairness is desirable (and we offer a handful of such types), then (2) lookup those types in our clusters, and then (3) just test for one item per cluster.}, number={6}, journal={ACM TRANSACTIONS ON SOFTWARE ENGINEERING AND METHODOLOGY}, author={Majumder, Suvodeep and Chakraborty, Joymallya and Bai, Gina R. and Stolee, Kathryn T. and Menzies, Tim}, year={2023}, month={Nov} } @article{peng_chakraborty_menzies_2023, title={FairMask: Better Fairness via Model-Based Rebalancing of Protected Attributes}, volume={49}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2022.3220713}, DOI={10.1109/TSE.2022.3220713}, abstractNote={Context: Machine learning software can generate models that inappropriately discriminate against specific protected social groups (e.g., groups based on gender, ethnicity, etc.). Motivated by those results, software engineering researchers have proposed many methods for mitigating those discriminatory effects. While those methods are effective in mitigating bias, few of them can provide explanations on what is the root cause of bias. Objective: We aim to better detect and mitigate algorithmic discrimination in machine learning software problems. Method: Here we propose ${{\sf FairMask}}$FairMask, a model-based extrapolation method that is capable of both mitigating bias and explaining the cause. In our ${{\sf FairMask}}$FairMask approach, protected attributes are represented by models learned from the other independent variables (and these models offer extrapolations over the space between existing examples). We then use the extrapolation models to relabel protected attributes later seen in testing data or deployment time. Our approach aims to offset the biased predictions of the classification model by rebalancing the distribution of protected attributes. Results: The experiments of this paper show that, without compromising (original) model performance, ${{\sf FairMask}}$FairMask can achieve significantly better group and individual fairness (as measured in different metrics) than benchmark methods. 
Moreover, compared to another instance-based rebalancing method, our model-based approach shows faster runtime and thus better scalability. Conclusion: Algorithmic decision bias can be removed via extrapolation that corrects the misleading latent correlation between the protected attributes and other non-protected ones. As evidence for this, our proposed FairMask performs better (as measured by fairness and performance metrics) than two state-of-the-art fairness algorithms. Reproduction Package: In order to better support open science, all scripts and data used in this study are available online at https://github.com/anonymous12138/biasmitigation.}, number={4}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, author={Peng, Kewen and Chakraborty, Joymallya and Menzies, Tim}, year={2023}, month={Apr}, pages={2426–2439} } @article{mathew_agrawal_menzies_2023, title={Finding Trends in Software Research}, volume={49}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2018.2870388}, DOI={10.1109/TSE.2018.2870388}, abstractNote={Text mining methods can find large-scale trends within research communities. For example, using stable Latent Dirichlet Allocation (a topic modeling algorithm) this study found 10 major topics in 35,391 SE research papers from 34 leading SE venues over the last 25 years (divided, evenly, between conferences and journals). Our study also shows how those topics have changed over recent years. Also, we note that (in the historical record) mono-focusing on a single topic can lead to fewer citations than otherwise. Further, while we find no overall gender bias in SE authorship, we note that women are under-represented in the top-most cited papers in our field. Lastly, we show a previously unreported dichotomy between software conferences and journals (so research topics that succeed at conferences might not succeed at journals, and vice versa). An important aspect of this work is that it is automatic and quickly repeatable (unlike prior SE bibliometric studies that used tediously slow and labor-intensive methods). Automation is important since, like any data mining study, its conclusions are skewed by the data used in the analysis. The automatic methods of this paper make it far easier for other researchers to re-apply the analysis to new data, or if they want to use different modeling assumptions.}, number={4}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, author={Mathew, George and Agrawal, Amritanshu and Menzies, Tim}, year={2023}, month={Apr}, pages={1397–1410} } @article{menzies_2023, title={How to "Sell" Ethics (Using AI): An Interview With Alexander Serebrenik}, volume={40}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2023.3249539}, DOI={10.1109/MS.2023.3249539}, abstractNote={“Most organizations are tone deaf when it comes to ethics,” says Prof. Alexander Serebrenik (Figure 1) of the Eindhoven University of Technology (https://tue.academia.edu/AlexanderSerebrenik).
“I’ve been trying to talk discrimination, diversity, and inclusion with them for years, and frankly, I’ve given up.”}, number={3}, journal={IEEE SOFTWARE}, author={Menzies, Tim}, year={2023}, pages={95–97} } @article{yedida_kang_tu_yang_lo_menzies_2023, title={How to Find Actionable Static Analysis Warnings: A Case Study With FindBugs}, volume={49}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2023.3234206}, DOI={10.1109/TSE.2023.3234206}, abstractNote={Automatically generated static code warnings suffer from a large number of false alarms. Hence, developers only take action on a small percentage of those warnings. To better predict which static code warnings should not be ignored, we suggest that analysts need to look deeper into their algorithms to find choices that better improve the particulars of their specific problem. Specifically, we show here that effective predictors of such warnings can be created by methods that locally adjust the decision boundary (between actionable warnings and others). These methods yield a new high water-mark for recognizing actionable static code warnings. For eight open-source Java projects (cassandra, jmeter, commons, lucene-solr, maven, ant, tomcat, derby) we achieve perfect test results on 4/8 datasets and, overall, a median AUC (area under the true negatives, true positives curve) of 92%.}, number={4}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, author={Yedida, Rahul and Kang, Hong Jin and Tu, Huy and Yang, Xueqi and Lo, David and Menzies, Tim}, year={2023}, month={Apr}, pages={2856–2872} } @article{menzies_johnson_roberts_alvarez_2023, title={The Engineering Mindset Is an Ethical Mindset (We Just Don't Teach It That Way ... Yet)}, volume={40}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2022.3227597}, DOI={10.1109/MS.2022.3227597}, abstractNote={Can we achieve an ethical engineering mindset without detracting from the core technical topics of CS subjects? Some doubt that this is possible. Lately we have been involved in too many discussions where participants were worried about how to teach ethics to SE/CS students. The problem, we heard, was too hard and too complex. In this short note, we beg to differ. Ethics is a well-studied issue. Philosophers have studied and taught ethics and civics for generations. Similarly, computer scientists and software engineers have assembled an extensive set of skills and resources which are relevant to an ethical education. Using those resources, this paper offers a proof-by-example of a CS class syllabus that (a) enables an ethical engineering mindset while (b) not detracting from core technical topics.}, number={2}, journal={IEEE SOFTWARE}, author={Menzies, Tim and Johnson, Brittany L. and Roberts, David and Alvarez, Lauren}, year={2023}, pages={103–110} } @article{johnson_menzies_2023, title={Unfairness Is Everywhere, so What to Do? An Interview With Jeanna Matthews}, volume={40}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2023.3305722}, DOI={10.1109/MS.2023.3305722}, abstractNote={Usually, when we talk to other software engineers about fairness and discrimination, it quickly becomes a conversation about measurement (e.g., how to check if different populations within society are getting different false-positive rates from that software). But if you talk to Dr. Jeanna Matthews from Clarkson University, the conversation is very different. She focuses on risk as a function of the severity of consequences and the probability of those consequences occurring.
Then she asks how legislation could help manage high-risk software.}, number={6}, journal={IEEE SOFTWARE}, author={Johnson, Brittany and Menzies, Tim}, year={2023}, month={Nov}, pages={135–138} } @article{peng_kaltenecker_siegmund_apel_menzies_2023, title={VEER: enhancing the interpretability of model-based optimizations}, volume={28}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-023-10296-w}, abstractNote={Many software systems can be tuned for multiple objectives (e.g., faster runtime, less required memory, less network traffic or energy consumption, etc.). Such systems can suffer from “disagreement” where different models have different (or even opposite) insights and tactics on how to optimize a system. For configuration problems, we show that (a) model disagreement is rampant; yet (b) prior to this paper, it has barely been explored. We aim at helping practitioners and researchers better solve multi-objective configuration optimization problems, by resolving model disagreement. We propose a dimension reduction method called VEER that builds a useful one-dimensional approximation to the original N-objective space. Traditional model-based optimizers use Pareto search to locate Pareto-optimal solutions to a multi-objective problem, which is computationally heavy on large-scale systems. VEER builds a surrogate that can replace the Pareto sorting step after deployment. Compared to the prior state-of-the-art, for 11 configurable systems, VEER significantly reduces disagreement and execution time, without compromising the optimization performance in most cases. For our largest problem (with tens of thousands of possible configurations), optimizing with VEER finds as good or better optimizations with zero model disagreements, three orders of magnitude faster. When employing model-based optimizers for multi-objective optimization, we recommend to apply VEER, which not only improves the execution time, but also resolves the potential model disagreement problem.}, number={3}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Peng, Kewen and Kaltenecker, Christian and Siegmund, Norbert and Apel, Sven and Menzies, Tim}, year={2023}, month={Jun} } @article{ling_menzies_2023, title={What Not to Test (For Cyber-Physical Systems)}, volume={49}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2023.3272309}, DOI={10.1109/TSE.2023.3272309}, abstractNote={For simulation-based systems, finding a set of test cases with the least cost by exploring multiple goals is a complex task. Domain-specific optimization goals (e.g., maximize output variance) are useful for guiding the rapid selection of test cases via mutation. But evaluating the selected test cases via mutation (that can distinguish the current program from) is a different goal to domain-specific optimizations. While the optimization goals can be used to guide the mutation analysis, that guidance should be viewed as a weak indicator since it can hurt the mutation effectiveness goals by focusing too much on the optimization goals. Based on the above, this paper proposes DoLesS (Domination with Least Squares Approximation) that selects the minimal and effective test cases by averaging over a coarse-grained grid of the information gained from multiple optimizations goals. DoLesS applies an inverted least squares approximation approach to find a minimal set of tests that can distinguish better from worse parts of the optimization goals. 
When tested on multiple simulation-based systems, DoLesS performs as well as, or even better than, the prior state-of-the-art, while running 80-360 times faster on average (seconds instead of hours).}, number={7}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, author={Ling, Xiao and Menzies, Tim}, year={2023}, month={Jul}, pages={3811–3826} } @article{mashkoor_menzies_egyed_ramler_2022, title={Artificial Intelligence and Software Engineering: Are We Ready?}, volume={55}, ISSN={["1558-0814"]}, DOI={10.1109/MC.2022.3144805}, abstractNote={Artificial intelligence and software engineering complement each other in various ways. This special issue highlights how this relationship is developing over time to address the challenges faced in modern-day computing.}, number={3}, journal={COMPUTER}, author={Mashkoor, Atif and Menzies, Tim and Egyed, Alexander and Ramler, Rudolf}, year={2022}, month={Mar}, pages={24–28} } @article{yu_carver_rothermel_menzies_2022, title={Assessing expert system-assisted literature reviews with a case study}, volume={200}, ISSN={["1873-6793"]}, DOI={10.1016/j.eswa.2022.116958}, abstractNote={Given the large number of publications in software engineering, frequent literature reviews are required to keep current on work in specific areas. One tedious task in literature reviews is finding relevant studies amongst thousands of non-relevant search results. In theory, expert systems can assist in finding relevant work but those systems have primarily been tested in simulations rather than in application to actual literature reviews. Hence, few researchers have faith in such expert systems. Accordingly, using a realistic case study, this paper assesses how well our state-of-the-art expert system can help with literature reviews. The assessed literature review aimed at identifying test case prioritization techniques for automated UI testing, specifically from 8,349 papers on IEEE Xplore. This corpus was studied with an expert system that incorporates an incrementally updated human-in-the-loop active learning tool. Using that expert system, in three hours, we found 242 relevant papers from which we identified 12 techniques representing the state-of-the-art in test case prioritization when source code information is not available. These results were then validated by six other graduate students manually exploring the same corpus. Without the expert system, this task would have required 53 hours and would have found 27 additional papers. That is, our expert system achieved 90% recall with 6% of the human effort cost when compared to a conventional manual method. Significantly, the same 12 state-of-the-art test case prioritization techniques were identified by both the expert system and the manual method. That is, the 27 papers missed by the expert system would not have changed the conclusion of the literature review. Hence, if this result generalizes, it endorses the use of our expert system to assist in literature reviews.}, journal={EXPERT SYSTEMS WITH APPLICATIONS}, author={Yu, Zhe and Carver, Jeffrey C. and Rothermel, Gregg and Menzies, Tim}, year={2022}, month={Aug} } @article{tu_yu_menzies_2022, title={Better Data Labelling With EMBLEM (and how that Impacts Defect Prediction)}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.2986415}, DOI={10.1109/TSE.2020.2986415}, abstractNote={Standard automatic methods for recognizing problematic development commits can be greatly improved via the incremental application of human+artificial expertise.
In this approach, called EMBLEM, an AI tool first explores the software development process to label commits that are most problematic. Humans then apply their expertise to check those labels (perhaps resulting in the AI updating the support vectors within their SVM learner). We recommend this human+AI partnership for several reasons. When a new domain is encountered, EMBLEM can learn better ways to label which commits refer to real problems. Also, in studies with 9 open source software projects, labelling via EMBLEM's incremental application of human+AI is at least an order of magnitude cheaper than existing methods ($\approx$ eight times). Further, EMBLEM is very effective. For the data sets explored here, EMBLEM's better labelling methods significantly improved $P_{opt}20$ and G-score performance in nearly all the projects studied here.}, number={1}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Tu, Huy and Yu, Zhe and Menzies, Tim}, year={2022}, month={Jan}, pages={278–294} } @article{shu_xia_williams_menzies_2022a, title={Dazzle: Using Optimized Generative Adversarial Networks to Address Security Data Class Imbalance Issue}, ISSN={["2160-1852"]}, DOI={10.1145/3524842.3528437}, abstractNote={Background: Machine learning techniques have been widely used and demonstrate promising performance in many software security tasks such as software vulnerability prediction. However, the class ratio within software vulnerability datasets is often highly imbalanced (since the percentage of observed vulnerabilities is usually very low). Goal: To help security practitioners address software security data class imbalance issues and further help build better prediction models with resampled datasets. Method: We introduce an approach called Dazzle which is an optimized version of conditional Wasserstein Generative Adversarial Networks with gradient penalty (cWGAN-GP). Dazzle explores the architecture hyperparameters of cWGAN-GP with a novel optimizer called Bayesian Optimization. We use Dazzle to generate minority class samples to resample the original imbalanced training dataset. Results: We evaluate Dazzle with three software security datasets, i.e., Moodle vulnerable files, Ambari bug reports, and JavaScript function code. We show that Dazzle is practical to use and demonstrates promising improvement over existing state-of-the-art oversampling techniques such as SMOTE (e.g., with an average of about 60% improvement rate over SMOTE in recall among all datasets). Conclusion: Based on this study, we would suggest the use of optimized GANs as an alternative method for security vulnerability data class imbalance issues.}, journal={2022 MINING SOFTWARE REPOSITORIES CONFERENCE (MSR 2022)}, author={Shu, Rui and Xia, Tianpei and Williams, Laurie and Menzies, Tim}, year={2022}, pages={144–155} } @article{peng_menzies_2022, title={Defect Reduction Planning (Using TimeLIME)}, volume={48}, url={https://doi.org/10.1109/TSE.2021.3062968}, DOI={10.1109/TSE.2021.3062968}, abstractNote={Software comes in releases. An implausible change to software is something that has never been changed in prior releases. When planning how to reduce defects, it is better to use plausible changes, i.e., changes with some precedence in the prior releases. To demonstrate these points, this paper compares several defect reduction planning tools.
LIME is a local sensitivity analysis tool that can report the fewest changes needed to alter the classification of some code module (e.g., from “defective” to “non-defective”). TimeLIME is a new tool, introduced in this paper, that improves LIME by restricting its plans to just those attributes which change the most within a project. In this study, we compared the performance of LIME and TimeLIME and several other defect reduction planning algorithms. The generated plans were assessed via (a) the similarity scores between the proposed code changes and the real code changes made by developers; and (b) the improvement scores seen within projects that followed the plans. For nine project trials, we found that TimeLIME outperformed all other algorithms (in 8 out of 9 trials). Hence, we strongly recommend using past releases as a source of knowledge for computing fixes for new releases (using TimeLIME). Apart from these specific results, the other lesson from this paper is that our community should be more careful about using off-the-shelf AI tools without first applying SE knowledge (e.g., that past releases are a good source of knowledge for planning defect reductions). As shown here, once that SE knowledge is applied, this can result in dramatically better reasoning.}, number={7}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Peng, Kewen and Menzies, Tim}, year={2022}, month={Jul}, pages={2510–2525} } @article{elder_zahan_shu_metro_kozarev_menzies_williams_2022, title={Do I really need all this work to find vulnerabilities? An empirical case study comparing vulnerability detection techniques on a Java application}, volume={27}, ISSN={["1573-7616"]}, url={http://dx.doi.org/10.1007/s10664-022-10179-6}, DOI={10.1007/s10664-022-10179-6}, number={6}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Science and Business Media LLC}, author={Elder, Sarah and Zahan, Nusrat and Shu, Rui and Metro, Monica and Kozarev, Valeri and Menzies, Tim and Williams, Laurie}, year={2022}, month={Nov} } @article{ling_agrawal_menzies_2022, title={How Different is Test Case Prioritization for Open and Closed Source Projects?}, volume={48}, url={https://doi.org/10.1109/TSE.2021.3063220}, DOI={10.1109/TSE.2021.3063220}, abstractNote={Improved test case prioritization means that software developers can detect and fix more software faults sooner than usual. But is there one “best” prioritization algorithm? Or do different kinds of projects deserve special kinds of prioritization? To answer these questions, this article applies nine prioritization schemes to 31 projects that range from (a) highly rated open-source Github projects to (b) computational science software to (c) a closed-source project. We find that prioritization approaches that work best for open-source projects can work worst for the closed-source project (and vice versa).
From these experiments, we conclude that (a) it is ill-advised to always apply one prioritization scheme to all projects since (b) prioritization requires tuning to different project types.}, number={7}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Ling, Xiao and Agrawal, Rishabh and Menzies, Tim}, year={2022}, month={Jul}, pages={2526–2540} } @article{yedida_menzies_2022, title={How to Improve Deep Learning for Software Analytics (a case study with code smell detection)}, ISSN={["2160-1852"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85134010393&partnerID=MN8TOARS}, DOI={10.1145/3524842.3528458}, abstractNote={To reduce technical debt and make code more maintainable, it is important to be able to warn programmers about code smells. State-of-the-art code smell detectors use deep learners, usually without exploring alternatives. For example, one promising alternative is GHOST (from TSE'21) that relies on a combination of hyper-parameter optimization of feedforward neural networks and a novel oversampling technique. The prior study from TSE'21 proposing this novel “fuzzy sampling” was somewhat limited in that the method was tested on defect prediction, but nothing else. Like defect prediction, code smell detection datasets have a class imbalance (which motivated “fuzzy sampling”). Hence, in this work we test if fuzzy sampling is useful for code smell detection. The results of this paper show that we can achieve better than state-of-the-art results on code smell detection with fuzzy oversampling. For example, for “feature envy”, we were able to achieve 99+% AUC across all our datasets, and on 8/10 datasets for “misplaced class”. While our specific results refer to code smell detection, they do suggest other lessons for other kinds of analytics. For example: (a) try better preprocessing before trying complex learners (b) include simpler learners as a baseline in software analytics (c) try “fuzzy sampling” as one such baseline. In order to support others trying to reproduce/extend/refute this work, all our code and data is available online at https://github.com/yrahul3910/code-smell-detection.}, journal={2022 MINING SOFTWARE REPOSITORIES CONFERENCE (MSR 2022)}, author={Yedida, Rahul and Menzies, Tim}, year={2022}, pages={156–166} } @article{yu_fahid_tu_menzies_2022, title={Identifying Self-Admitted Technical Debts With Jitterbug: A Two-Step Approach}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.3031401}, DOI={10.1109/TSE.2020.3031401}, abstractNote={Keeping track of and managing Self-Admitted Technical Debts (SATDs) are important to maintaining a healthy software project. This requires much time and effort from human experts to identify the SATDs manually. The current automated solutions do not have satisfactory precision and recall in identifying SATDs to fully automate the process. To solve the above problems, we propose a two-step framework called Jitterbug for identifying SATDs. Jitterbug first identifies the “easy to find” SATDs automatically with close to 100 percent precision using a novel pattern recognition technique. Subsequently, machine learning techniques are applied to assist human experts in manually identifying the remaining “hard to find” SATDs with reduced human effort.
Our simulation studies on ten software projects show that Jitterbug can identify SATDs more efficiently (with less human effort) than the prior state-of-the-art methods.}, number={5}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Yu, Zhe and Fahid, Fahmid Morshed and Tu, Huy and Menzies, Tim}, year={2022}, month={May}, pages={1676–1691} } @article{majumder_xia_krishna_menzies_2022, title={Methods for Stabilizing Models Across Large Samples of Projects (with case studies on Predicting Defect and Project Health)}, ISSN={["2160-1852"]}, DOI={10.1145/3524842.3527934}, abstractNote={Despite decades of research, Software Engineering (SE) lacks widely accepted models (that offer precise quantitative stable predictions) about what factors most influence software quality. This paper provides a promising result showing such stable models can be generated using a new transfer learning framework called “STABILIZER”. Given a tree of recursively clustered projects (using project meta-data), STABILIZER promotes a model upwards if it performs best in the lower clusters (stopping when the promoted model performs worse than the models seen at a lower level). The number of models found by STABILIZER is minimal: one for defect prediction (756 projects) and less than a dozen for project health (1628 projects). Hence, via STABILIZER, it is possible to find a few projects which can be used for transfer learning and make conclusions that hold across hundreds of projects at a time. Further, the models produced in this manner offer predictions that perform as well or better than the prior state-of-the-art. To the best of our knowledge, STABILIZER is order of magnitude faster than the prior state-of-the-art transfer learners which seek to find conclusion stability, and these case studies are the largest demonstration of the generalizability of quantitative predictions of project quality yet reported in the SE literature. In order to support open science, all our scripts and data are online at https://github.com/Anonymous633671/STABILIZER.}, journal={2022 MINING SOFTWARE REPOSITORIES CONFERENCE (MSR 2022)}, author={Majumder, Suvodeep and Xia, Tianpei and Krishna, Rahul and Menzies, Tim}, year={2022}, pages={566–578} } @article{shu_xia_williams_menzies_2022, title={Omni: automated ensemble with unexpected models against adversarial evasion attack}, volume={27}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-021-10064-8}, DOI={10.1007/s10664-021-10064-8}, abstractNote={Machine learning-based security detection models have become prevalent in modern malware and intrusion detection systems. However, previous studies show that such models are susceptible to adversarial evasion attacks. In this type of attack, inputs (i.e., adversarial examples) are specially crafted by intelligent malicious adversaries, with the aim of being misclassified by existing state-of-the-art models (e.g., deep neural networks). Once the attackers can fool a classifier to think that a malicious input is actually benign, they can render a machine learning-based malware or intrusion detection system ineffective. To help security practitioners and researchers build a more robust model against non-adaptive, white-box and non-targeted adversarial evasion attacks through the idea of ensemble model. 
We propose an approach called Omni, the main idea of which is to explore methods that create an ensemble of “unexpected models”; i.e., models whose control hyperparameters have a large distance to the hyperparameters of an adversary’s target model, with which we then make an optimized weighted ensemble prediction. In studies with five types of adversarial evasion attacks (FGSM, BIM, JSMA, DeepFool and Carlini-Wagner) on five security datasets (NSL-KDD, CIC-IDS-2017, CSE-CIC-IDS2018, CICAndMal2017 and the Contagio PDF dataset), we show Omni is a promising approach as a defense strategy against adversarial attacks when compared with other baseline treatments. When employing ensemble defense against adversarial evasion attacks, we suggest to create ensemble with unexpected models that are distant from the attacker’s expected model (i.e., target model) through methods such as hyperparameter optimization.}, number={1}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Science and Business Media LLC}, author={Shu, Rui and Xia, Tianpei and Williams, Laurie and Menzies, Tim}, year={2022}, month={Jan} } @article{xia_fu_shu_agrawal_menzies_2022, title={Predicting health indicators for open source projects (using hyperparameter optimization)}, volume={27}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-022-10171-0}, DOI={10.1007/s10664-022-10171-0}, abstractNote={Software developed on public platform is a source of data that can be used to make predictions about those projects. While the individual developing activity may be random and hard to predict, the developing behavior on project level can be predicted with good accuracy when large groups of developers work together on software projects. To demonstrate this, we use 64,181 months of data from 1,159 GitHub projects to make various predictions about the recent status of those projects (as of April 2020). We find that traditional estimation algorithms make many mistakes. Algorithms like k-nearest neighbors (KNN), support vector regression (SVR), random forest (RFT), linear regression (LNR), and regression trees (CART) have high error rates. But that error rate can be greatly reduced using hyperparameter optimization. To the best of our knowledge, this is the largest study yet conducted, using recent data for predicting multiple health indicators of open-source projects. To facilitate open science (and replications and extensions of this work), all our materials are available online at https://github.com/arennax/Health_Indicator_Prediction .}, number={6}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Xia, Tianpei and Fu, Wei and Shu, Rui and Agrawal, Rishabh and Menzies, Tim}, year={2022}, month={Nov} } @article{majumder_mody_menzies_2022, title={Revisiting process versus product metrics: a large scale analysis}, volume={27}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-021-10068-4}, abstractNote={Numerous methods can build predictive models from software data. However, what methods and conclusions should we endorse as we move from analytics in-the-small (dealing with a handful of projects) to analytics in-the-large (dealing with hundreds of projects)? To answer this question, we recheck prior small-scale results (about process versus product metrics for defect prediction and the granularity of metrics) using 722,471 commits from 700 Github projects. We find that some analytics in-the-small conclusions still hold when scaling up to analytics in-the-large. 
For example, like prior work, we see that process metrics are better predictors for defects than product metrics (best process/product-based learners respectively achieve recalls of 98%/44% and AUCs of 95%/54%, median values). That said, we warn that it is unwise to trust metric importance results from analytics in-the-small studies since those change dramatically when moving to analytics in-the-large. Also, when reasoning in-the-large about hundreds of projects, it is better to use predictions from multiple models (since single model predictions can become confused and exhibit a high variance).}, number={3}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Majumder, Suvodeep and Mody, Pranav and Menzies, Tim}, year={2022}, month={May} } @article{xia_shu_shen_menzies_2022, title={Sequential Model Optimization for Software Effort Estimation}, volume={48}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.3047072}, DOI={10.1109/TSE.2020.3047072}, abstractNote={Many methods have been proposed to estimate how much effort is required to build and maintain software. Much of that research tries to recommend a single method – an approach that makes the dubious assumption that one method can handle the diversity of software project data. To address this drawback, we apply a configuration technique called “ROME” (Rapid Optimizing Methods for Estimation), which uses sequential model-based optimization (SMO) to find what configuration settings of effort estimation techniques work best for a particular data set. We test this method using data from 1161 traditional waterfall projects and 120 contemporary projects (from GitHub). In terms of magnitude of relative error and standardized accuracy, we find that ROME achieves better performance than the state-of-the-art methods for both traditional waterfall and contemporary projects. In addition, we conclude that we should not recommend one method for estimation. Rather, it is better to search through a wide range of different methods to find what works best for the local data. To the best of our knowledge, this is the largest effort estimation experiment yet attempted and the only one to test its methods on traditional waterfall and contemporary projects.}, number={6}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Xia, Tianpei and Shu, Rui and Shen, Xipeng and Menzies, Tim}, year={2022}, month={Jun}, pages={1994–2009} } @article{bencomo_guo_harrison_heyn_menzies_2022, title={The Secret to Better AI and Better Software (Is Requirements Engineering)}, volume={39}, ISSN={["1937-4194"]}, DOI={10.1109/MS.2021.3118099}, abstractNote={Recently, practitioners and researchers met to discuss the role of requirements, and AI and SE. We offer here notes on that fascinating discussion. Also, have you considered writing for this column? This “SE for AI” column publishes commentaries on the growing field of SE for AI. Submissions are welcomed and encouraged (1,000–2,400 words, each figure and table counts as 250 words, try to use fewer than 12 references, and keep the discussion practitioner focused). Please submit your ideas to me at timm@ieee.org.—Tim Menzies}, number={1}, journal={IEEE SOFTWARE}, author={Bencomo, Nelly and Guo, Jin L. C. 
and Harrison, Rachel and Heyn, Hans-Martin and Menzies, Tim}, year={2022}, month={Jan}, pages={105–110} } @article{shrikanth_nichols_fahid_menzies_2021, title={Assessing practitioner beliefs about software engineering}, volume={26}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-021-09957-5}, abstractNote={Software engineering is a highly dynamic discipline. Hence, as times change, so too might our beliefs about core processes in this field. This paper checks some five beliefs that originated in the past decades that comment on the relationships between (i) developer productivity; (ii) software quality and (iii) years of developer experience. Using data collected from 1,356 developers in the period 1995 to 2006, we found support for only one of the five beliefs titled “Quality entails productivity.” We found no clear support for four other beliefs based on programming languages and software developers. However, from the sporadic evidence of the four other beliefs, we learned that a narrow scope could delude practitioners in misinterpreting certain effects to hold in their day-to-day work. Lastly, through an aggregated view of assessing the five beliefs, we find programming languages act as a confounding factor for developer productivity and software quality. Thus the overall message of this work is that it is both important and possible to revisit old beliefs in software engineering. Researchers and practitioners should routinely retest old beliefs.}, number={4}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Shrikanth, N. C. and Nichols, William and Fahid, Fahmid Morshed and Menzies, Tim}, year={2021}, month={Jul} } @article{chakraborty_majumder_menzies_2021, title={Bias in Machine Learning Software: Why? How? What to Do?}, url={https://doi.org/10.1145/3468264.3468537}, DOI={10.1145/3468264.3468537}, abstractNote={Increasingly, software is making autonomous decisions in case of criminal sentencing, approving credit cards, hiring employees, and so on. Some of these decisions show bias and adversely affect certain social groups (e.g. those defined by sex, race, age, marital status). Many prior works on bias mitigation take the following form: change the data or learners in multiple ways, then see if any of that improves fairness. Perhaps a better approach is to postulate root causes of bias and then applying some resolution strategy. This paper postulates that the root causes of bias are the prior decisions that affect- (a) what data was selected and (b) the labels assigned to those examples. Our Fair-SMOTE algorithm removes biased labels; and rebalances internal distributions such that based on sensitive attribute, examples are equal in both positive and negative classes. On testing, it was seen that this method was just as effective at reducing bias as prior approaches. Further, models generated via Fair-SMOTE achieve higher performance (measured in terms of recall and F1) than other state-of-the-art fairness improvement algorithms. 
To the best of our knowledge, measured in terms of number of analyzed learners and datasets, this study is one of the largest studies on bias mitigation yet presented in the literature.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Chakraborty, Joymallya and Majumder, Suvodeep and Menzies, Tim}, year={2021}, pages={429–440} } @article{wang_wang_chen_menzies_cui_xie_wang_2021, title={Characterizing Crowds to Better Optimize Worker Recommendation in Crowdsourced Testing}, volume={47}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2019.2918520}, DOI={10.1109/TSE.2019.2918520}, abstractNote={Crowdsourced testing is an emerging trend, in which test tasks are entrusted to the online crowd workers. Typically, a crowdsourced test task aims to detect as many bugs as possible within a limited budget. However not all crowd workers are equally skilled at finding bugs; Inappropriate workers may miss bugs, or report duplicate bugs, while hiring them requires nontrivial budget. Therefore, it is of great value to recommend a set of appropriate crowd workers for a test task so that more software bugs can be detected with fewer workers. This paper first presents a new characterization of crowd workers and characterizes them with testing context, capability, and domain knowledge. Based on the characterization, we then propose Multi-Objective Crowd wOrker recoMmendation approach (MOCOM), which aims at recommending a minimum number of crowd workers who could detect the maximum number of bugs for a crowdsourced testing task. Specifically, MOCOM recommends crowd workers by maximizing the bug detection probability of workers, the relevance with the test task, the diversity of workers, and minimizing the test cost. We experimentally evaluate MOCOM on 532 test tasks, and results show that MOCOM significantly outperforms five commonly-used and state-of-the-art baselines. Furthermore, MOCOM can reduce duplicate reports and recommend workers with high relevance and larger bug detection probability; because of this it can find more bugs with fewer workers.}, number={6}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Wang, Junjie and Wang, Song and Chen, Jianfeng and Menzies, Tim and Cui, Qiang and Xie, Miao and Wang, Qing}, year={2021}, month={Jun}, pages={1259–1276} } @article{yang_menzies_2021, title={Documenting Evidence of a Replication of 'Analyze This! 145 Questions for Data Scientists in Software Engineering'}, url={https://doi.org/10.1145/3468264.3477219}, DOI={10.1145/3468264.3477219}, abstractNote={We report here the use of the 145 software engineering questions for data scientists presented in the Microsoft study in a recent FSE~'20 paper by Huijgens et al. The study by Begel et al. 
was replicated by Huijgens et al.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Yang, Xueqi and Menzies, Tim}, year={2021}, pages={1602–1602} } @article{yang_menzies_2021a, title={Documenting Evidence of a Replication of 'Populating a Release History Database from Version Control and Bug Tracking Systems'}, url={https://doi.org/10.1145/3468264.3477218}, DOI={10.1145/3468264.3477218}, abstractNote={We report here the use of a keyword-based and regular expression-based approach to identify bug-fixing commits by linking commit messages and issue tracker data in a recent FSE '20 paper by Penta et al., "On the Relationship between Refactoring Actions and Bugs: A Differentiated Replication". The approach replicated is a keyword-based and regular expression-based approach as studied by Fischer et al.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Yang, Xueqi and Menzies, Tim}, year={2021}, pages={1601–1601} } @article{yang_menzies_2021b, title={Documenting Evidence of a Reproduction of 'Is There A "Golden" Feature Set for Static Warning Identification? - An Experimental Evaluation'}, url={https://doi.org/10.1145/3468264.3477220}, DOI={10.1145/3468264.3477220}, abstractNote={We report here the use of the static analysis dataset generated by FindBugs in a recent EMSE '21 paper by Yang et al. The artifact reproduced is the use of supervised models to perform static analysis based on a golden feature set, as studied by Wang et al.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Yang, Xueqi and Menzies, Tim}, year={2021}, pages={1603–1603} } @article{peng_menzies_2021, title={Documenting Evidence of a Reuse of '"Why Should I Trust You?": Explaining the Predictions of Any Classifier'}, url={https://doi.org/10.1145/3468264.3477217}, DOI={10.1145/3468264.3477217}, abstractNote={We report here the following example of reuse. LIME is a local instance-based explanation generation framework that was originally proposed by Ribeiro et al. in their paper "'Why Should I Trust You?': Explaining the Predictions of Any Classifier". The framework was reused by Peng et al. in their paper "Defect Reduction Planning (using TimeLIME)".
The paper used the original implementation of LIME as one of the core components in the proposed framework.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Peng, Kewen and Menzies, Tim}, year={2021}, pages={1600–1600} } @article{lustosa_menzies_2021, title={Documenting Evidence of a Reuse of 'A Systematic Literature Review of Techniques and Metrics to Reduce the Cost of Mutation Testing'}, url={https://doi.org/10.1145/3468264.3477214}, DOI={10.1145/3468264.3477214}, abstractNote={This submission is a report on the reuse of Pizzoleto et al.'s Systematic Literature Review by Guizzo et al.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Lustosa, Andre and Menzies, Tim}, year={2021}, pages={1597–1597} } @article{yedida_menzies_2021, title={Documenting Evidence of a Reuse of 'A Systematic Study of the Class Imbalance Problem in Convolutional Neural Networks'}, url={https://doi.org/10.1145/3468264.3477212}, DOI={10.1145/3468264.3477212}, abstractNote={We report here the reuse of oversampling, and modifications to the basic approach, used in a recent TSE ’21 paper by Yedida & Menzies. The method reused is the oversampling technique studied by Buda et al. These methods were studied in the SE domain (specifically, for defect prediction), and extended by Yedida & Menzies.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, publisher={ACM}, author={Yedida, Rahul and Menzies, Tim}, year={2021}, pages={1595–1595} } @article{yedida_menzies_2021a, title={Documenting Evidence of a Reuse of 'On the Number of Linear Regions of Deep Neural Networks'}, url={https://doi.org/10.1145/3468264.3477213}, DOI={10.1145/3468264.3477213}, abstractNote={We report here the reuse of theoretical insights from deep learning literature, used in a recent TSE '21 paper by Yedida & Menzies. The artifact replicated is the lower bound on the number of piecewise linear regions in the decision boundary of a feedforward neural network with ReLU activations, as studied by Montufar et al. We document the reuse of Theorem 4 from Montufar et al. by Yedida & Menzies.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, publisher={ACM}, author={Yedida, Rahul and Menzies, Tim}, year={2021}, pages={1596–1596} } @article{lustosa_menzies_2021a, title={Documenting Evidence of a Reuse of 'RefactoringMiner 2.0'}, url={https://doi.org/10.1145/3468264.3477215}, DOI={10.1145/3468264.3477215}, abstractNote={This submission is a report on the reuse of Tsantalis et al.'s Refactoring Miner (RMiner) package by Penta et al.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Lustosa, Andre and Menzies, Tim}, year={2021}, pages={1598–1598} } @article{peng_menzies_2021a, title={Documenting Evidence of a Reuse of 'What is a Feature? A Qualitative Study of Features in Industrial Software Product Lines'}, url={https://doi.org/10.1145/3468264.3477216}, DOI={10.1145/3468264.3477216}, abstractNote={We report here the following example of reuse.
The original paper is a prior work about features in product lines by Berger et al. The paper "Dimensions of software configuration: on the configuration context in modern software development" by Siegmund et al. reused definitions and theories about configuration features in the original paper.}, journal={PROCEEDINGS OF THE 29TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING (ESEC/FSE '21)}, author={Peng, Kewen and Menzies, Tim}, year={2021}, pages={1599–1599} } @article{shrikanth_majumder_menzies_2021, title={Early Life Cycle Software Defect Prediction. Why? How?}, ISSN={["0270-5257"]}, DOI={10.1109/ICSE43902.2021.00050}, abstractNote={Many researchers assume that, for software analytics, "more data is better." We write to show that, at least for learning defect predictors, this may not be true. To demonstrate this, we analyzed hundreds of popular GitHub projects. These projects ran for 84 months and contained 3,728 commits (median values). Across these projects, most of the defects occur very early in their life cycle. Hence, defect predictors learned from the first 150 commits and four months perform just as well as anything else. This means that, at least for the projects studied here, after the first few months, we need not continually update our defect prediction models. We hope these results inspire other researchers to adopt a "simplicity-first" approach to their work. Some domains require a complex and data-hungry analysis. But before assuming complexity, it is prudent to check the raw data looking for "short cuts" that can simplify the analysis.}, journal={2021 IEEE/ACM 43RD INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2021)}, author={Shrikanth, N. C. and Majumder, Suvodeep and Menzies, Tim}, year={2021}, pages={448–459} } @article{tu_menzies_2021, title={FRUGAL: Unlocking Semi-Supervised Learning for Software Analytics}, DOI={10.1109/ASE51524.2021.9678617}, abstractNote={Standard software analytics often involves having a large amount of data with labels in order to commission models with acceptable performance. However, prior work has shown that such requirements can be expensive, taking several weeks to label thousands of commits, and not always available when traversing new research problems and domains. Unsupervised Learning is a promising direction to learn hidden patterns within unlabelled data, which has only been extensively studied in defect prediction. Nevertheless, unsupervised learning can be ineffective by itself and has not been explored in other domains (e.g., static analysis and issue close time). Motivated by this literature gap and technical limitations, we present FRUGAL, a tuned semi-supervised method that builds on a simple optimization scheme that does not require sophisticated (e.g., deep learners) and expensive (e.g., 100% manually labelled data) methods. FRUGAL optimizes the unsupervised learner's configurations (via a simple grid search) while validating our design decision of labelling just 2.5% of the data before prediction. As shown by the experiments of this paper, FRUGAL outperforms the state-of-the-art adoptable static code warning recognizer and issue closed time predictor, while reducing the cost of labelling by a factor of 40 (from 100% to 2.5%).
Hence we assert that FRUGAL can save considerable effort in data labelling, especially in validating prior work or researching new problems. Based on this work, we suggest that proponents of complex and expensive methods should always baseline such methods against simpler and cheaper alternatives. For instance, a semi-supervised learner like FRUGAL can serve as a baseline to state-of-the-art software analytics.}, journal={2021 36TH IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING ASE 2021}, author={Tu, Huy and Menzies, Tim}, year={2021}, pages={394–406} } @article{shu_xia_chen_williams_menzies_2021, title={How to Better Distinguish Security Bug Reports (Using Dual Hyperparameter Optimization)}, volume={26}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-020-09906-8}, DOI={10.1007/s10664-020-09906-8}, number={3}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Science and Business Media LLC}, author={Shu, Rui and Xia, Tianpei and Chen, Jianfeng and Williams, Laurie and Menzies, Tim}, year={2021}, month={May} } @article{yu_theisen_williams_menzies_2021, title={Improving Vulnerability Inspection Efficiency Using Active Learning}, volume={47}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2019.2949275}, DOI={10.1109/TSE.2019.2949275}, abstractNote={Software engineers can find vulnerabilities with less effort if they are directed towards code that might contain more vulnerabilities. HARMLESS is an incremental support vector machine tool that builds a vulnerability prediction model from the source code inspected to date, then suggests what source code files should be inspected next. In this way, HARMLESS can reduce the time and effort required to achieve some desired level of recall for finding vulnerabilities. The tool also provides feedback on when to stop (at that desired level of recall) while at the same time, correcting human errors by double-checking suspicious files. This paper evaluates HARMLESS on Mozilla Firefox vulnerability data. HARMLESS found 80, 90, 95, 99 percent of the vulnerabilities by inspecting 10, 16, 20, 34 percent of the source code files. When targeting 90, 95, 99 percent recall, HARMLESS could stop after inspecting 23, 30, 47 percent of the source code files. Even when human reviewers fail to identify half of the vulnerabilities (50 percent false negative rate), HARMLESS could detect 96 percent of the missing vulnerabilities by double-checking half of the inspected files. Our results serve to highlight the very steep cost of protecting software from vulnerabilities (in our case study that cost is, for example, the human effort of inspecting 28,750 × 20% = 5,750 source code files to identify 95 percent of the vulnerabilities). While this result could benefit the mission-critical projects where human resources are available for inspecting thousands of source code files, the research challenge for future work is how to further reduce that cost.
The conclusion of this paper discusses various ways that goal might be achieved.}, number={11}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Yu, Zhe and Theisen, Christopher and Williams, Laurie and Menzies, Tim}, year={2021}, month={Nov}, pages={2401–2420} } @article{yang_chen_yedida_yu_menzies_2021, title={Learning to recognize actionable static code warnings (is intrinsically easy)}, volume={26}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-021-09948-6}, DOI={10.1007/s10664-021-09948-6}, abstractNote={Static code warning tools often generate warnings that programmers ignore. Such tools can be made more useful via data mining algorithms that select the “actionable” warnings; i.e. the warnings that are usually not ignored. In this paper, we look for actionable warnings within a sample of 5,675 actionable warnings seen in 31,058 static code warnings from FindBugs. We find that data mining algorithms can find actionable warnings with remarkable ease. Specifically, a range of data mining methods (deep learners, random forests, decision tree learners, and support vector machines) all achieved very good results (recalls and AUC(TRN, TPR) measures usually over 95% and false alarms usually under 5%). Given that all these learners succeeded so easily, it is appropriate to ask if there is something about this task that is inherently easy. We report that while our data sets have up to 58 raw features, those features can be approximated by less than two underlying dimensions. For such intrinsically simple data, many different kinds of learners can generate useful models with similar performance. Based on the above, we conclude that learning to recognize actionable static code warnings is easy, using a wide range of learning algorithms, since the underlying data is intrinsically simple. If we had to pick one particular learner for this task, we would suggest linear SVMs (since, at least in our sample, that learner ran relatively quickly and achieved the best median performance) and we would not recommend deep learning (since this data is intrinsically very simple).}, number={3}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Science and Business Media LLC}, author={Yang, Xueqi and Chen, Jianfeng and Yedida, Rahul and Yu, Zhe and Menzies, Tim}, year={2021}, month={May} } @article{tu_papadimitriou_kiran_wang_mandal_deelman_menzies_2021, title={Mining Workflows for Anomalous Data Transfers}, ISSN={["2160-1852"]}, DOI={10.1109/MSR52588.2021.00013}, abstractNote={Modern scientific workflows are data-driven and are often executed on distributed, heterogeneous, high-performance computing infrastructures. Anomalies and failures in the work-flow execution cause loss of scientific productivity and inefficient use of the infrastructure. Hence, detecting, diagnosing, and mitigating these anomalies are immensely important for reliable and performant scientific workflows. Since these workflows rely heavily on high-performance network transfers that require strict QoS constraints, accurately detecting anomalous network performance is crucial to ensure reliable and efficient workflow execution. To address this challenge, we have developed X-FLASH, a network anomaly detection tool for faulty TCP workflow transfers. X-FLASH incorporates novel hyperparameter tuning and data mining approaches for improving the performance of the machine learning algorithms to accurately classify the anomalous TCP packets. 
X-FLASH leverages XGBoost as an ensemble model and couples XGBoost with a sequential optimizer, FLASH, borrowed from search-based Software Engineering to learn the optimal model parameters. X-FLASH found configurations that outperformed the existing approach by up to 28%, 29%, and 40% (relative improvement) for F-measure, G-score, and recall in less than 30 evaluations. Given (1) this large improvement and (2) the simplicity of the tuning, we recommend that future research adopt such tuning studies as a standard step, at least in the area of scientific workflow anomaly detection.}, journal={2021 IEEE/ACM 18TH INTERNATIONAL CONFERENCE ON MINING SOFTWARE REPOSITORIES (MSR 2021)}, author={Tu, Huy and Papadimitriou, George and Kiran, Mariam and Wang, Cong and Mandal, Anirban and Deelman, Ewa and Menzies, Tim}, year={2021}, pages={1–12} } @article{yedida_menzies_2021, title={On the Value of Oversampling for Deep Learning in Software Defect Prediction}, volume={48}, ISSN={0098-5589 1939-3520 2326-3881}, url={http://dx.doi.org/10.1109/TSE.2021.3079841}, DOI={10.1109/TSE.2021.3079841}, abstractNote={One truism of deep learning is that the automatic feature engineering (seen in the first layers of those networks) excuses data scientists from performing tedious manual feature engineering prior to running DL. For the specific case of deep learning for defect prediction, we show that that truism is false. Specifically, when we pre-process data with a novel oversampling technique called fuzzy sampling, as part of a larger pipeline called GHOST (Goal-oriented Hyper-parameter Optimization for Scalable Training), then we can do significantly better than the prior DL state of the art in 14/20 defect data sets. Our approach yields state-of-the-art results, significantly faster than prior deep learners. These results present a cogent case for the use of oversampling prior to applying deep learning on software defect prediction datasets.}, number={8}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Yedida, Rahul and Menzies, Tim}, year={2021}, pages={1–1} } @article{menzies_2021, title={Shockingly Simple: "Keys" for Better AI for SE}, volume={38}, ISSN={["1937-4194"]}, url={https://doi.org/10.1109/MS.2020.3043014}, DOI={10.1109/MS.2020.3043014}, abstractNote={As 2020 drew to a close, I was thinking about what lessons we have learned about software engineering (SE) for artificial intelligence (AI)-things that we can believe now but, in the last century, would have seemed somewhat shocking. One very surprising lesson, at least for me, is the success of the very complex and very simple. At the complex end, there is now much evidence for the value of deep learners for high-dimensional software engineering problems. For example, consider signal processing for autonomous cars. When reasoning over (say) 10,000 wavelets collected from a vision system, then deep learning can automate much of the engineering required to cover all those data.}, number={2}, journal={IEEE SOFTWARE}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Menzies, Tim}, year={2021}, pages={114–118} } @article{agrawal_yang_agrawal_yedida_shen_menzies_2021, title={Simpler Hyperparameter Optimization for Software Analytics: Why, How, When}, volume={48}, ISSN={0098-5589 1939-3520 2326-3881}, url={http://dx.doi.org/10.1109/TSE.2021.3073242}, DOI={10.1109/TSE.2021.3073242}, abstractNote={How can we make software analytics simpler and faster?
One method is to match the complexity of analysis to the intrinsic complexity of the data being explored. For example, hyperparameter optimizers find the control settings for data miners that improve the predictions generated via software analytics. Sometimes, very fast hyperparameter optimization can be achieved by "DODGE-ing"; i.e., simply steering away from settings that lead to similar conclusions. But when is it wise to use that simple approach and when must we use more complex (and much slower) optimizers? To answer this, we applied hyperparameter optimization to 120 SE data sets that explored bad smell detection, predicting Github issue close time, bug report analysis, defect prediction, and dozens of other non-SE problems. We find that the simple DODGE works best for data sets with low "intrinsic dimensionality" ($\mu_D \approx 3$) and very poorly for higher-dimensional data ($\mu_D > 8$). Nearly all the SE data seen here was intrinsically low-dimensional, indicating that DODGE is applicable for many SE analytics tasks.}, number={8}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Agrawal, Amritanshu and Yang, Xueqi and Agrawal, Rishabh and Yedida, Rahul and Shen, Xipeng and Menzies, Tim}, year={2021}, pages={1–1} } @article{elder_zahan_kozarev_shu_menzies_williams_2021, title={Structuring a Comprehensive Software Security Course Around the OWASP Application Security Verification Standard}, url={http://dx.doi.org/10.1109/icse-seet52601.2021.00019}, DOI={10.1109/ICSE-SEET52601.2021.00019}, abstractNote={Lack of security expertise among software practitioners is a problem with many implications. First, there is a deficit of security professionals to meet current needs. Additionally, even practitioners who do not plan to work in security may benefit from increased understanding of security. The goal of this paper is to aid software engineering educators in designing a comprehensive software security course by sharing an experience running a software security course for the eleventh time. Through all the eleven years of running the software security course, the course objectives have been comprehensive - ranging from security testing, to secure design and coding, to security requirements, to security risk management. For the first time in this eleventh year, a theme of the course assignments was to map vulnerability discovery to the security controls of the Open Web Application Security Project (OWASP) Application Security Verification Standard (ASVS). Based upon student performance on a final exploratory penetration testing project, this mapping may have increased students' depth of understanding of a wider range of security topics. The students efficiently detected 191 unique and verified vulnerabilities of 28 different Common Weakness Enumeration (CWE) types during a three-hour period in the OpenMRS project, an electronic health record application in active use.}, journal={2021 IEEE/ACM 43RD INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING: JOINT TRACK ON SOFTWARE ENGINEERING EDUCATION AND TRAINING (ICSE-JSEET 2021)}, publisher={IEEE}, author={Elder, Sarah E.
and Zahan, Nusrat and Kozarev, Val and Shu, Rui and Menzies, Tim and Williams, Laurie}, year={2021}, pages={95–104} } @article{yang_yu_wang_menzies_2021, title={Understanding static code warnings: An incremental AI approach}, volume={167}, ISSN={["1873-6793"]}, url={https://doi.org/10.1016/j.eswa.2020.114134}, DOI={10.1016/j.eswa.2020.114134}, abstractNote={Knowledge-based systems reason over some knowledge base. Hence, an important issue for such systems is how to acquire the knowledge needed for their inference. This paper assesses active learning methods for acquiring knowledge for “static code warnings”. Static code analysis is a widely-used method for detecting bugs and security vulnerabilities in software systems. As software becomes more complex, analysis tools also report lists of increasingly complex warnings that developers need to address on a daily basis. Such static code analysis tools are usually over-cautious; i.e. they often offer many warnings about spurious issues. Previous research work shows that about 35% to 91 % warnings reported as bugs by SA tools are actually unactionable (i.e., warnings that would not be acted on by developers because they are falsely suggested as bugs). Experienced developers know which errors are important and which can be safely ignored. How can we capture that experience? This paper reports on an incremental AI tool that watches humans reading false alarm reports. Using an incremental support vector machine mechanism, this AI tool can quickly learn to distinguish spurious false alarms from more serious matters that deserve further attention. In this work, nine open-source projects are employed to evaluate our proposed model on the features extracted by previous researchers and identify the actionable warnings in a priority order given by our algorithm. We observe that our model can identify over 90% of actionable warnings when our methods tell humans to ignore 70 to 80% of the warnings.}, journal={EXPERT SYSTEMS WITH APPLICATIONS}, author={Yang, Xueqi and Yu, Zhe and Wang, Junjie and Menzies, Tim}, year={2021}, month={Apr} } @article{krishna_nair_jamshidi_menzies_2021, title={Whence to Learn? Transferring Knowledge in Configurable Systems Using BEETLE}, volume={47}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2020.2983927}, DOI={10.1109/TSE.2020.2983927}, abstractNote={As software systems grow in complexity and the space of possible configurations increases exponentially, finding the near-optimal configuration of a software system becomes challenging. Recent approaches address this challenge by learning performance models based on a sample set of configurations. However, collecting enough sample configurations can be very expensive since each such sample requires configuring, compiling, and executing the entire system using a complex test suite. When learning on new data is too expensive, it is possible to use Transfer Learning to “transfer” old lessons to the new context. Traditional transfer learning has a number of challenges, specifically, (a) learning from excessive data takes excessive time, and (b) the performance of the models built via transfer can deteriorate as a result of learning from a poor source. To resolve these problems, we propose a novel transfer learning framework called BEETLE, which is a “bellwether”-based transfer learner that focuses on identifying and learning from the most relevant source from amongst the old data. 
This paper evaluates BEETLE with 57 different software configuration problems based on five software systems (a video encoder, an SAT solver, a SQL database, a high-performance C-compiler, and a streaming data analytics tool). In each of these cases, BEETLE found configurations that are as good as or better than those found by other state-of-the-art transfer learners while requiring only a fraction ($\frac{1}{7}$th) of the measurements needed by those other methods. Based on these results, we say that BEETLE is a new high-water mark in optimally configuring software.}, number={12}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Krishna, Rahul and Nair, Vivek and Jamshidi, Pooyan and Menzies, Tim}, year={2021}, month={Dec}, pages={2956–2972} } @article{shrikanth_menzies_2020, title={Assessing Practitioner Beliefs about Software Defect Prediction}, ISBN={["978-1-4503-7123-0"]}, DOI={10.1145/3377813.3381367}, abstractNote={Just because software developers say they believe in "X", that does not necessarily mean that "X" is true. As shown here, there exist numerous beliefs listed in the recent Software Engineering literature which are only supported by small portions of the available data. Hence we ask what is the source of this disconnect between beliefs and evidence? To answer this question we look for evidence for ten beliefs within 300,000+ changes seen in dozens of open-source projects. Some of those beliefs had strong support across all the projects; specifically, "A commit that involves more added and removed lines is more bug-prone" and "Files with fewer lines contributed by their owners (who contribute most changes) are bug-prone". Most of the widely-held beliefs studied are only sporadically supported in the data; i.e. large effects can appear in project data and then disappear in subsequent releases. Such sporadic support explains why developers believe things that were relevant to their prior work, but not necessarily their current work. Our conclusion will be that we need to change the nature of the debate within Software Engineering. Specifically, while it is important to report the effects that hold right now, it is also important to report on what effects change over time.}, journal={2020 IEEE/ACM 42ND INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING: SOFTWARE ENGINEERING IN PRACTICE (ICSE-SEIP)}, author={Shrikanth, N. C. and Menzies, Tim}, year={2020}, pages={182–190} } @article{agrawal_menzies_minku_wagner_yu_2020, title={Better software analytics via "DUO": Data mining algorithms using/used-by optimizers}, volume={25}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-020-09808-9}, DOI={10.1007/s10664-020-09808-9}, abstractNote={This paper claims that a new field of empirical software engineering research and practice is emerging: data mining using/used-by optimizers for empirical studies, or DUO. For example, data miners can generate models that are explored by optimizers. Also, optimizers can advise how to best adjust the control parameters of a data miner. This combined approach acts like an agent leaning over the shoulder of an analyst that advises "ask this question next" or "ignore that problem, it is not relevant to your goals".
Further, those agents can help us build "better" predictive models, where "better" can be either greater predictive accuracy or faster modeling time (which, in turn, enables the exploration of a wider range of options). We also caution that the era of papers that just use data miners is coming to an end. Results obtained from an unoptimized data miner can be quickly refuted, just by applying an optimizer to produce a different (and better performing) model. Our conclusion, hence, is that for software analytics it is possible, useful and necessary to combine data mining and optimization using DUO.}, number={3}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Science and Business Media LLC}, author={Agrawal, Amritanshu and Menzies, Tim and Minku, Leandro L. and Wagner, Markus and Yu, Zhe}, year={2020}, month={May}, pages={2099–2136} } @article{carleton_harper_lyu_eldh_xie_menzies_2020, title={Expert Perspectives on AI}, volume={37}, ISSN={["1937-4194"]}, DOI={10.1109/MS.2020.2987673}, abstractNote={IEEE Software: With the rapid changes occurring in the fields of artificial intelligence (AI) and machine learning (ML), what areas do you think are the most important to focus on right now, especially in relation to software engineering?}, number={4}, journal={IEEE SOFTWARE}, author={Carleton, Anita D. and Harper, Erin and Lyu, Michael R. and Eldh, Sigrid and Xie, Tao and Menzies, Tim}, year={2020}, pages={87–94} } @article{nair_yu_menzies_siegmund_apel_2020, title={Finding Faster Configurations Using FLASH}, volume={46}, url={https://doi.org/10.1109/TSE.2018.2870895}, DOI={10.1109/TSE.2018.2870895}, abstractNote={Finding good configurations of a software system is often challenging since the number of configuration options can be large. Software engineers often make poor choices about configuration or, even worse, they usually use a sub-optimal configuration in production, which leads to inadequate performance. To assist engineers in finding the better configuration, this article introduces Flash, a sequential model-based method that sequentially explores the configuration space by reflecting on the configurations evaluated so far to determine the next best configuration to explore. Flash scales up to software systems that defeat the prior state-of-the-art model-based methods in this area. Flash runs much faster than existing methods and can solve both single-objective and multi-objective optimization problems. The central insight of this article is to use the prior knowledge of the configuration space (gained from prior runs) to choose the next promising configuration. This strategy reduces the effort (i.e., number of measurements) required to find the better configuration. We evaluate Flash using 30 scenarios based on 7 software systems to demonstrate that Flash saves effort in 100 and 80 percent of cases in single-objective and multi-objective problems respectively by up to several orders of magnitude compared to state-of-the-art techniques.}, number={7}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Nair, Vivek and Yu, Zhe and Menzies, Tim and Siegmund, Norbert and Apel, Sven}, year={2020}, month={Jul}, pages={794–811} } @article{krishna_menzies_2020, title={Learning actionable analytics from multiple software projects}, volume={25}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-020-09843-6}, abstractNote={The current generation of software analytics tools are mostly prediction algorithms (e.g. 
support vector machines, naive bayes, logistic regression, etc). While prediction is useful, after prediction comes planning about what actions to take in order to improve quality. This research seeks methods that generate demonstrably useful guidance on "what to do" within the context of a specific software project. Specifically, we propose XTREE (for within-project planning) and BELLTREE (for cross-project planning) to generate plans that can improve software quality. Each such plan has the property that, if followed, it reduces the expected number of future defect reports. To find this expected number, planning was first applied to data from release x. Next, we looked for change in release x + 1 that conformed to our plans. This procedure was applied using a range of planners from the literature, as well as XTREE. In 10 open-source JAVA systems, several hundreds of defects were reduced in sections of the code that conformed to XTREE's plans. Further, when compared to other planners, XTREE's plans were found to be easier to implement (since they were shorter) and more effective at reducing the expected number of defects.}, number={5}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Krishna, Rahul and Menzies, Tim}, year={2020}, month={Sep}, pages={3468–3500} } @article{chakraborty_peng_menzies_2020, title={Making Fair ML Software using Trustworthy Explanation}, ISSN={["1527-1366"]}, DOI={10.1145/3324884.3418932}, abstractNote={Machine learning software is being used in many applications (finance, hiring, admissions, criminal justice) having huge social impact. But sometimes the behavior of this software is biased and it shows discrimination based on some sensitive attributes such as sex, race, etc. Prior works concentrated on finding and mitigating bias in ML models. A recent trend is using instance-based model-agnostic explanation methods such as LIME [36] to find out bias in the model prediction. Our work concentrates on finding shortcomings of current bias measures and explanation methods. We show how our proposed method based on K nearest neighbors can overcome those shortcomings and find the underlying bias of black box models. Our results are more trustworthy and helpful for the practitioners. Finally, we describe our future framework combining explanation and planning to build fair software.}, journal={2020 35TH IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING (ASE 2020)}, author={Chakraborty, Joymallya and Peng, Kewen and Menzies, Tim}, year={2020}, pages={1229–1233} } @article{carleton_harper_menzies_xie_eldh_lyu_2020, title={The AI Effect: Working at the Intersection of AI and SE}, volume={37}, ISSN={["1937-4194"]}, DOI={10.1109/MS.2020.2987666}, abstractNote={This special issue explores the intersection of artificial intelligence (AI) and software engineering (SE), that is, what can AI do for SE, and how can we as software engineers design and build better AI systems?}, number={4}, journal={IEEE SOFTWARE}, author={Carleton, Anita D. and Harper, Erin and Menzies, Tim and Xie, Tao and Eldh, Sigrid and Lyu, Michael R.}, year={2020}, pages={26–35} } @article{menzies_2020, title={The Five Laws of SE for AI}, volume={37}, ISSN={["1937-4194"]}, DOI={10.1109/MS.2019.2954841}, abstractNote={It is time to talk about software engineering (SE) for artificial intelligence (AI). As shown in Figure 1, industry is becoming increasingly dependent on AI software. Clearly, AI is useful for SE. But what about the other way around? How important is SE for AI?
Many thought leaders in the AI industry are asking how to better develop and maintain AI software (see Figure 2).}, number={1}, journal={IEEE SOFTWARE}, author={Menzies, Tim}, year={2020}, pages={81–85} } @article{shrikanth_menzies_2020, title={What disconnects Practitioner Belief and Empirical Evidence ?}, ISSN={["0270-5257"]}, DOI={10.1145/3377812.3390802}, abstractNote={Just because software developers say they believe in "X", that does not necessarily mean that "X" is true. As shown here, there exist numerous beliefs listed in the recent Software Engineering literature which are only supported by small portions of the available data. Hence we ask what is the source of this disconnect between beliefs and evidence?.To answer this question we look for evidence for ten beliefs within 300,000+ changes seen in dozens of open-source projects. Some of those beliefs had strong support across all the projects; specifically, "A commit that involves more added and removed lines is more bug-prone" and "Files with fewer lines contributed by their owners (who contribute most changes) are bug-prone".Most of the widely-held beliefs studied are only sporadically supported in the data; i.e. large effects can appear in project data and then disappear in subsequent releases. Such sporadic support explains why developers believe things that were relevant to their prior work, but not necessarily their current work.}, journal={2020 ACM/IEEE 42ND INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING: COMPANION PROCEEDINGS (ICSE-COMPANION 2020)}, author={Shrikanth, N. C. and Menzies, Tim}, year={2020}, pages={286–287} } @article{wang_yang_menzies_wang_2020, title={iSENSE2.0: Improving Completion-aware Crowdtesting Management with Duplicate Tagger and Sanity Checker}, volume={29}, ISSN={["1557-7392"]}, url={https://doi.org/10.1145/3394602}, DOI={10.1145/3394602}, abstractNote={Software engineers get questions of “how much testing is enough” on a regular basis. Existing approaches in software testing management employ experience-, risk-, or value-based analysis to prioritize and manage testing processes. However, very few is applicable to the emerging crowdtesting paradigm to cope with extremely limited information and control over unknown, online crowdworkers. In practice, deciding when to close a crowdtesting task is largely done by experience-based guesswork and frequently results in ineffective crowdtesting. More specifically, it is found that an average of 32% testing cost was wasteful spending in current crowdtesting practice. This article intends to address this challenge by introducing automated decision support for monitoring and determining appropriate time to close crowdtesting tasks.}, number={4}, journal={ACM TRANSACTIONS ON SOFTWARE ENGINEERING AND METHODOLOGY}, author={Wang, Junjie and Yang, Ye and Menzies, Tim and Wang, Qing}, year={2020}, month={Oct} } @article{menzies_shepperd_2019, title={"Bad smells" in software analytics papers}, volume={112}, ISSN={["1873-6025"]}, url={https://doi.org/10.1016/j.infsof.2019.04.005}, DOI={10.1016/j.infsof.2019.04.005}, abstractNote={There has been a rapid growth in the use of data analytics to underpin evidence-based software engineering. However the combination of complex techniques, diverse reporting standards and poorly understood underlying phenomena are causing some concern as to the reliability of studies. Our goal is to provide guidance for producers and consumers of software analytics studies (computational experiments and correlation studies). 
We propose using "bad smells", i.e., surface indications of deeper problems (a metaphor popular in the agile software community), and consider how they may be manifest in software analytics studies. We list 12 "bad smells" in software analytics papers (and show their impact by examples). We believe the metaphor of bad smell is a useful device. Therefore we encourage more debate on what contributes to the validity of software analytics studies (so we expect our list will mature over time).}, journal={INFORMATION AND SOFTWARE TECHNOLOGY}, publisher={Elsevier BV}, author={Menzies, Tim and Shepperd, Martin}, year={2019}, month={Aug}, pages={35–47} } @article{chen_nair_krishna_menzies_2019, title={"Sampling" as a Baseline Optimizer for Search-Based Software Engineering}, volume={45}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2018.2790925}, DOI={10.1109/TSE.2018.2790925}, abstractNote={Increasingly, Software Engineering (SE) researchers use search-based optimization techniques to solve SE problems with multiple conflicting objectives. These techniques often apply CPU-intensive evolutionary algorithms to explore generations of mutations to a population of candidate solutions. An alternative approach, proposed in this paper, is to start with a very large population and sample down to just the better solutions. We call this method "Sway", short for "the sampling way". This paper compares Sway versus state-of-the-art search-based SE tools using seven models: five software product line models; and two other software process control models (concerned with project management, effort estimation, and selection of requirements) during incremental agile development. For these models, the experiments of this paper show that Sway is competitive with corresponding state-of-the-art evolutionary algorithms while requiring orders of magnitude fewer evaluations. Considering the simplicity and effectiveness of Sway, we, therefore, propose this approach as a baseline method for search-based software engineering models, especially for models that are very slow to execute.}, number={6}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Chen, Jianfeng and Nair, Vivek and Krishna, Rahul and Menzies, Tim}, year={2019}, month={Jun}, pages={597–614} } @article{choetkiertikul_dam_tran_pham_ghose_menzies_2019, title={A Deep Learning Model for Estimating Story Points}, volume={45}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2018.2792473}, DOI={10.1109/TSE.2018.2792473}, abstractNote={Although there has been substantial research in software analytics for effort estimation in traditional software projects, little work has been done for estimation in agile projects, especially estimating the effort required for completing user stories or issues. Story points are the most common unit of measure used for estimating the effort involved in completing a user story or resolving an issue. In this paper, we propose a prediction model for estimating story points based on a novel combination of two powerful deep learning architectures: long short-term memory and recurrent highway network. Our prediction system is end-to-end trainable from raw input data to prediction outcomes without any manual feature engineering. We offer a comprehensive dataset for story points-based estimation that contains 23,313 issues from 16 open source projects.
An empirical evaluation demonstrates that our approach consistently outperforms three common baselines (Random Guessing, Mean, and Median methods) and six alternatives (e.g., using Doc2Vec and Random Forests) in Mean Absolute Error, Median Absolute Error, and Standardized Accuracy.}, number={7}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Choetkiertikul, Morakot and Dam, Hoa Khanh and Tran, Truyen and Pham, Trang and Ghose, Aditya and Menzies, Tim}, year={2019}, month={Jul}, pages={637–656} } @article{krishna_menzies_2019, title={Bellwethers: A Baseline Method for Transfer Learning}, volume={45}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2018.2821670}, DOI={10.1109/TSE.2018.2821670}, abstractNote={Software analytics builds quality prediction models for software projects. Experience shows that (a) the more projects studied, the more varied are the conclusions; and (b) project managers lose faith in the results of software analytics if those results keep changing. To reduce this conclusion instability, we propose the use of "bellwethers": given N projects from a community, the bellwether is the project whose data yields the best predictions on all others. The bellwethers offer a way to mitigate conclusion instability because conclusions about a community are stable as long as this bellwether continues as the best oracle. Bellwethers are also simple to discover (just wrap a for-loop around standard data miners). When compared to other transfer learning methods (TCA+, transfer Naive Bayes, value cognitive boosting), using just the bellwether data to construct a simple transfer learner yields comparable predictions. Further, bellwethers appear in many SE tasks such as defect prediction, effort estimation, and bad smell detection. We hence recommend using bellwethers as a baseline method for transfer learning against which future work should be compared.}, number={11}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Krishna, Rahul and Menzies, Tim}, year={2019}, month={Nov}, pages={1081–1105} } @article{yu_menzies_2019, title={FAST(2): An intelligent assistant for finding relevant papers}, volume={120}, ISSN={["1873-6793"]}, url={https://doi.org/10.1016/j.eswa.2018.11.021}, DOI={10.1016/j.eswa.2018.11.021}, abstractNote={Literature reviews are essential for any researcher trying to keep up to date with the burgeoning software engineering literature. FAST$^2$ is a novel tool for reducing the effort required for conducting literature reviews by assisting the researchers to find the next promising paper to read (among a set of unread papers). This paper describes FAST$^2$ and tests it on four large software engineering literature reviews conducted by Wahono (2015), Hall (2012), Radjenovi\'c (2013) and Kitchenham (2017). We find that FAST$^2$ is a faster and more robust tool for assisting researchers in finding relevant SE papers, one which can compensate for the errors made by humans during the review process.
The effectiveness of FAST$^2$ can be attributed to three key innovations: (1) a novel way of applying external domain knowledge (a simple two or three keyword search) to guide the initial selection of papers---which helps to find relevant research papers faster with less variance; (2) an estimator of the number of remaining relevant papers yet to be found---which in practical settings can be used to decide if the reviewing process needs to be terminated; (3) a novel self-correcting classification algorithm---which automatically corrects itself in cases where the researcher wrongly classifies a paper.}, journal={EXPERT SYSTEMS WITH APPLICATIONS}, publisher={Elsevier BV}, author={Yu, Zhe and Menzies, Tim}, year={2019}, month={Apr}, pages={57–71} } @article{agrawal_fu_chen_shen_menzies_2019, title={How to "DODGE" Complex Software Analytics}, volume={47}, ISSN={0098-5589 1939-3520 2326-3881}, url={http://dx.doi.org/10.1109/tse.2019.2945020}, DOI={10.1109/TSE.2019.2945020}, abstractNote={Machine learning techniques applied to software engineering tasks can be improved by hyperparameter optimization, i.e., automatic tools that find good settings for a learner's control parameters. We show that such hyperparameter optimization can be unnecessarily slow, particularly when the optimizers waste time exploring "redundant tunings", i.e., pairs of tunings which lead to indistinguishable results. By ignoring redundant tunings, DODGE($\mathcal{E}$), a tuning tool, runs orders of magnitude faster, while also generating learners with more accurate predictions than seen in prior state-of-the-art approaches.}, number={10}, journal={IEEE Transactions on Software Engineering}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Agrawal, Amritanshu and Fu, Wei and Chen, Di and Shen, Xipeng and Menzies, Tim}, year={2019}, pages={1–1} } @article{wang_li_wang_menzies_wang_2019, title={Images don't lie: Duplicate crowdtesting reports detection with screenshot information}, volume={110}, ISSN={["1873-6025"]}, url={https://doi.org/10.1016/j.infsof.2019.03.003}, DOI={10.1016/j.infsof.2019.03.003}, abstractNote={Crowdtesting is effective especially when it comes to the feedback on GUI systems, or subjective opinions about features. Despite this, we find crowdtesting reports are highly replicated, i.e., 82% of them are replicates of others. Hence automatically detecting replicate reports could help reduce triaging efforts. Most of the existing approaches mainly adopted textual information for replicate detection, and suffered from low accuracy because of the expression gap. Our observation on real industrial crowdtesting data found that when dealing with crowdtesting reports of GUI systems, the reports would be accompanied by images, i.e., the screenshots of the app. We assume the screenshot to be valuable for replicate crowdtesting report detection because it reflects the real scenario of the failure and is not affected by the variety of natural languages. In this work, we propose a replicate detection approach, TSDetector, which combines information from the screenshots and the textual descriptions to detect replicate crowdtesting reports. We extract four types of features to characterize the screenshots and the textual descriptions, and design an algorithm to detect replicates based on four similarity scores derived from the four different features respectively.
We investigate the effectiveness and advantage of TSDetector on 15 commercial projects with 4,172 reports from one of China's largest crowdtesting platforms. Results show that TSDetector can outperform existing state-of-the-art approaches significantly. In addition, we also evaluate its usefulness using real-world case studies. The feedback from real-world testers demonstrates its practical value.}, journal={INFORMATION AND SOFTWARE TECHNOLOGY}, publisher={Elsevier BV}, author={Wang, Junjie and Li, Mingyang and Wang, Song and Menzies, Tim and Wang, Qing}, year={2019}, month={Jun}, pages={139–155} } @article{chen_chakraborty_clark_haverlock_cherian_menzies_2019, title={Predicting Breakdowns in Cloud Services (with SPIKE)}, DOI={10.1145/3338906.3340450}, abstractNote={Maintaining web-services is a mission-critical task where any downtime means loss of revenue and reputation (of being a reliable service provider). In the current competitive web services market, such a loss of reputation causes extensive loss of future revenue. To address this issue, we developed SPIKE, a data mining tool which can predict upcoming service breakdowns, half an hour into the future. Such predictions let an organization alert and assemble the tiger team to address the problem (e.g. by reconfiguring cloud hardware in order to reduce the likelihood of that breakdown). SPIKE utilizes (a) regression tree learning (with CART); (b) synthetic minority over-sampling (to handle how rare spikes are in our data); (c) hyperparameter optimization (to learn best settings for our local data) and (d) a technique we called "topology sampling" where training vectors are built from extensive details of an individual node plus summary details on all their neighbors. In the experiments reported here, SPIKE predicted service spikes 30 minutes into the future with recalls and precision of 75% and above. Also, SPIKE performed relatively better than other widely-used learning methods (neural nets, random forests, logistic regression).}, journal={ESEC/FSE'2019: PROCEEDINGS OF THE 2019 27TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Chen, Jianfeng and Chakraborty, Joymallya and Clark, Philip and Haverlock, Kevin and Cherian, Snehit and Menzies, Tim}, year={2019}, pages={916–924} } @article{yu_fahid_menzies_rothermel_patrick_cherian_2019, title={TERMINATOR: Better Automated UI Test Case Prioritization}, DOI={10.1145/3338906.3340448}, abstractNote={Automated UI testing is an important component of the continuous integration process of software development. A modern web-based UI is an amalgam of reports from dozens of microservices written by multiple teams. Queries on a page that opens up another will fail if any of that page's microservices fails. As a result, the overall cost for automated UI testing is high since the UI elements cannot be tested in isolation. For example, the entire automated UI testing suite at LexisNexis takes around 30 hours (3-5 hours on the cloud) to execute, which slows down the continuous integration process. To mitigate this problem and give developers faster feedback on their code, test case prioritization techniques are used to reorder the automated UI test cases so that more failures can be detected earlier. Given that much of the automated UI testing is "black box" in nature, very little information (only the test case descriptions and testing results) can be utilized to prioritize these automated UI test cases.
Hence, this paper evaluates 17 "black box" test case prioritization approaches that do not rely on source code information. Among these, we propose a novel TCP approach, that dynamically re-prioritizes the test cases when new failures are detected, by applying and adapting a state of the art framework from the total recall problem. Experimental results on LexisNexis automated UI testing data show that our new approach (which we call TERMINATOR), outperformed prior state of the art approaches in terms of failure detection rates with negligible CPU overhead.}, journal={ESEC/FSE'2019: PROCEEDINGS OF THE 2019 27TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Yu, Zhe and Fahid, Fahmid and Menzies, Tim and Rothermel, Gregg and Patrick, Kyle and Cherian, Snehit}, year={2019}, pages={883–894} } @article{menzies_2019, title={Take Control (On the Unreasonable Effectiveness of Software Analytics)}, ISBN={978-1-7281-1760-7}, DOI={10.1109/ICSE-SEIP.2019.00037}, abstractNote={The goal of software analytics should be insight; i.e. surprises that make us change the way we do business (in our case, the business of software construction, maintenance and evolution). So what insights have we learned from software analytics? Our field is rife with any number of truisms that are commonly quoted but rarely checked. That stops now. Using software analytics, we can check, and prune, many of those truisms. Specifically, the number of variables required to make predictions about SE projects is remarkably small-which means that (a) most of the things we think might affect software quality have little impact in practice; (b) controlling just a few key variables can be enough to improve software quality.}, journal={2019 IEEE/ACM 41ST INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING: SOFTWARE ENGINEERING IN PRACTICE (ICSE-SEIP 2019)}, author={Menzies, Tim}, year={2019}, pages={265–266} } @article{wang_yang_krishna_menzies_wang_2019, title={iSENSE: Completion-Aware Crowdtesting Management}, ISSN={["0270-5257"]}, DOI={10.1109/ICSE.2019.00097}, abstractNote={Trade-offs such as "how much testing is enough" are critical yet challenging project decisions in software engineering. Most existing approaches adopt risk-driven or value-based analysis to prioritize test cases and minimize test runs. However, none of these is applicable to the emerging crowd testing paradigm where task requesters typically have no control over online crowdworkers's dynamic behavior and uncertain performance. In current practice, deciding when to close a crowdtesting task is largely done by guesswork due to lack of decision support. This paper intends to fill this gap by introducing automated decision support for monitoring and determining appropriate time to close the crowdtesting tasks. First, this paper investigates the necessity and feasibility of close prediction of crowdtesting tasks based on industrial dataset. 
Then, it designs 8 methods for close prediction, based on various models including the bug trend, bug arrival, and capture-recapture models. Finally, the evaluation is conducted on 218 crowdtesting tasks from one of the largest crowdtesting platforms in China, and the results show that a median of 91% of bugs can be detected with 49% saved cost.}, journal={2019 IEEE/ACM 41ST INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE 2019)}, author={Wang, Junjie and Yang, Ye and Krishna, Rahul and Menzies, Tim and Wang, Qing}, year={2019}, pages={912–923} } @article{yang_falessi_menzies_hihn_2018, title={Actionable Analytics for Software Engineering INTRODUCTION}, volume={35}, ISSN={["1937-4194"]}, DOI={10.1109/ms.2017.4541039}, abstractNote={Although intensive research on software analytics has been going on for nearly a decade, a repeated complaint in software analytics is that industrial practitioners find it hard to apply the results generated from data science. This theme issue aims to reflect on actionable analytics for software engineering and to document a catalog of success stories in which analytics has been proven actionable and useful, in some significant way, in an organization. This issue features five articles covering promising analytical methods for improving change triage, strategic maintenance, and team robustness, as well as the success stories of applying analytical tools during an organizational transformation.}, number={1}, journal={IEEE SOFTWARE}, author={Yang, Ye and Falessi, Davide and Menzies, Tim and Hihn, Jairus}, year={2018}, pages={51–53} } @article{chen_fu_krishna_menzies_2018, title={Applications of Psychological Science for Actionable Analytics}, DOI={10.1145/3236024.3236050}, abstractNote={According to psychological scientists, humans understand models that most match their own internal models, which they characterize as lists of "heuristics" (i.e. lists of very succinct rules). One such heuristic rule generator is the Fast-and-Frugal Trees (FFT) preferred by psychological scientists. Despite their successful use in many applied domains, FFTs have not been applied in software analytics. Accordingly, this paper assesses FFTs for software analytics. We find that FFTs are remarkably effective in that their models are very succinct (5 lines or less describing a binary decision tree) while also outperforming results from very recent, top-level, conference papers. Also, when we restrict training data to operational attributes (i.e., those attributes that are frequently changed by developers), the performance of FFTs is not affected (while the performance of other learners can vary wildly). Our conclusions are two-fold. Firstly, there is much that the software analytics community could learn from psychological science. Secondly, proponents of complex methods should always baseline those methods against simpler alternatives.
For example, FFTs could be used as a standard baseline learner against which other software analytics tools are compared.}, journal={ESEC/FSE'18: PROCEEDINGS OF THE 2018 26TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Chen, Di and Fu, Wei and Krishna, Rahul and Menzies, Tim}, year={2018}, pages={456–467} } @article{chen_nair_menzies_2018, title={Beyond evolutionary algorithms for search-based software engineering}, volume={95}, ISSN={["1873-6025"]}, DOI={10.1016/j.infsof.2017.08.007}, abstractNote={Context: Evolutionary algorithms typically require a large number of evaluations (of solutions) to converge - which can be very slow and expensive to evaluate.Objective: To solve search-based software engineering (SE) problems, using fewer evaluations than evolutionary methods.Method: Instead of mutating a small population, we build a very large initial population which is then culled using a recursive bi-clustering chop approach. We evaluate this approach on multiple SE models, unconstrained as well as constrained, and compare its performance with standard evolutionary algorithms. Results: Using just a few evaluations (under 100), we can obtain comparable results to state-of-the-art evolutionary algorithms.Conclusion: Just because something works, and is widespread use, does not necessarily mean that there is no value in seeking methods to improve that method. Before undertaking search-based SE optimization tasks using traditional EAs, it is recommended to try other techniques, like those explored here, to obtain the same results with fewer evaluations.}, journal={INFORMATION AND SOFTWARE TECHNOLOGY}, author={Chen, Jianfeng and Nair, Vivek and Menzies, Tim}, year={2018}, month={Mar}, pages={281–294} } @article{nair_agrawal_chen_fu_mathew_menzies_minku_wagner_yu_2018, title={Data-Driven Search-based Software Engineering}, ISSN={["2160-1852"]}, DOI={10.1145/3196398.3196442}, abstractNote={This paper introduces Data-Driven Search-based Software Engineering (DSE), which combines insights from Mining Software Repositories (MSR) and Search-based Software Engineering (SBSE). While MSR formulates software engineering problems as data mining problems, SBSE reformulates Software Engineering (SE) problems as optimization problems and use meta-heuristic algorithms to solve them. Both MSR and SBSE share the common goal of providing insights to improve software engineering. The algorithms used in these two areas also have intrinsic relationships. We, therefore, argue that combining these two fields is useful for situations (a)~which require learning from a large data source or (b)~when optimizers need to know the lay of the land to find better solutions, faster. This paper aims to answer the following three questions: (1) What are the various topics addressed by DSE?, (2) What types of data are used by the researchers in this area?, and (3) What research approaches do researchers use? The paper briefly sets out to act as a practical guide to develop new DSE techniques and also to serve as a teaching resource. This paper also presents a resource (tiny.cc/data-se) for exploring DSE. The resource contains 89 artifacts which are related to DSE, divided into 13 groups such as requirements engineering, software product lines, software processes. 
All the materials in this repository have been used in recent software engineering papers; i.e., for all this material, there exist baseline results against which researchers can comparatively assess their new ideas.}, journal={2018 IEEE/ACM 15TH INTERNATIONAL CONFERENCE ON MINING SOFTWARE REPOSITORIES (MSR)}, author={Nair, Vivek and Agrawal, Amritanshu and Chen, Jianfeng and Fu, Wei and Mathew, George and Menzies, Tim and Minku, Leandro and Wagner, Markus and Yu, Zhe}, year={2018}, pages={341–352} } @article{nair_menzies_siegmund_apel_2018, title={Faster discovery of faster system configurations with spectral learning}, volume={25}, ISSN={["1573-7535"]}, DOI={10.1007/s10515-017-0225-2}, abstractNote={Despite the huge spread and economical importance of configurable software systems, there is unsatisfactory support in utilizing the full potential of these systems with respect to finding performance-optimal configurations. Prior work on predicting the performance of software configurations suffered from either (a) requiring far too many sample configurations or (b) large variances in their predictions. Both these problems can be avoided using the WHAT spectral learner. WHAT’s innovation is the use of the spectrum (eigenvalues) of the distance matrix between the configurations of a configurable software system, to perform dimensionality reduction. Within that reduced configuration space, many closely associated configurations can be studied by executing only a few sample configurations. For the subject systems studied here, a few dozen samples yield accurate and stable predictors—less than 10% prediction error, with a standard deviation of less than 2%. When compared to the state of the art, WHAT (a) requires 2–10 times fewer samples to achieve similar prediction accuracies, and (b) its predictions are more stable (i.e., have lower standard deviation). Furthermore, we demonstrate that predictive models generated by WHAT can be used by optimizers to discover system configurations that closely approach the optimal performance.}, number={2}, journal={AUTOMATED SOFTWARE ENGINEERING}, author={Nair, Vivek and Menzies, Tim and Siegmund, Norbert and Apel, Sven}, year={2018}, month={Jun}, pages={247–277} } @article{yu_kraft_menzies_2018, title={Finding better active learners for faster literature reviews}, volume={23}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-017-9587-0}, DOI={10.1007/s10664-017-9587-0}, abstractNote={Literature reviews can be time-consuming and tedious to complete. By cataloging and refactoring three state-of-the-art active learning techniques from evidence-based medicine and legal electronic discovery, this paper finds and implements FASTREAD, a faster technique for studying a large corpus of documents, combining and parametrizing the most efficient active learning algorithms. This paper assesses FASTREAD using datasets generated from existing SE literature reviews (Hall, Wahono, Radjenović, Kitchenham et al.). Compared to manual methods, FASTREAD lets researchers find 95% relevant studies after reviewing an order of magnitude fewer papers. Compared to other state-of-the-art automatic methods, FASTREAD reviews 20–50% fewer studies while finding same number of relevant primary studies in a systematic literature review.}, number={6}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Nature}, author={Yu, Zhe and Kraft, Nicholas A. 
and Menzies, Tim}, year={2018}, month={Dec}, pages={3161–3186} } @article{petke_menzies_2018, title={Guest Editorial for the Special Section from the 9th International Symposium on Search Based Software Engineering}, volume={104}, ISSN={["1873-6025"]}, DOI={10.1016/j.infsof.2018.10.002}, abstractNote={Context: Organizations increasingly develop software in a distributed manner. The Cloud provides an environment to create and maintain software-based products and services. Currently, it is unknown which software processes are suited for Cloud-based development and what their effects in specific contexts are.Objective: We aim at better understanding the software process applied to distributed software development using the Cloud as development environment. We further aim at providing an instrument, which helps project managers comparing different solution approaches and to adapt team processes to improve future project activities and outcomes.Method: We provide a simulation model, which helps analyzing different project parameters and their impact on projects performed in the Cloud. To evaluate the simulation model, we conduct different analyses using a Scrumban process and data from a project executed in Finland and Spain. An extra adaptation of the simulation model for Scrum and Kanban was used to evaluate the suitability of the simulation model to cover further process models.Results: A comparison of the real project data with the results obtained from the different simulation runs shows the simulation producing results close to the real data, and we could successfully replicate a distributed software project. Furthermore, we could show that the simulation model is suitable to address further process models.Conclusion: The simulator helps reproducing activities, developers, and events in the project, and it helps analyzing potential tradeoffs, e.g., regarding throughput, total time, project size, team size and work-in-progress limits. Furthermore, the simulation model supports project managers selecting the most suitable planning alternative thus supporting decision-making processes.}, journal={INFORMATION AND SOFTWARE TECHNOLOGY}, author={Petke, Justyna and Menzies, Tim}, year={2018}, month={Dec}, pages={194–194} } @article{nam_fu_kim_menzies_tan_2018, title={Heterogeneous Defect Prediction}, volume={44}, ISSN={["1939-3520"]}, url={https://doi.org/10.1109/TSE.2017.2720603}, DOI={10.1109/TSE.2017.2720603}, abstractNote={Many recent studies have documented the success of cross-project defect prediction (CPDP) to predict defects for new projects lacking in defect data by using prediction models built by other projects. However, most studies share the same limitations: it requires homogeneous data; i.e., different projects must describe themselves using the same metrics. This paper presents methods for heterogeneous defect prediction (HDP) that matches up different metrics in different projects. Metric matching for HDP requires a “large enough” sample of distributions in the source and target projects-which raises the question on how large is “large enough” for effective heterogeneous defect prediction. This paper shows that empirically and theoretically, “large enough” may be very small indeed. For example, using a mathematical model of defect prediction, we identify categories of data sets were as few as 50 instances are enough to build a defect prediction model. 
Our conclusion for this work is that, even when projects use different metric sets, it is possible to quickly transfer lessons learned about defect prediction.}, number={9}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Nam, Jaechang and Fu, Wei and Kim, Sunghun and Menzies, Tim and Tan, Lin}, year={2018}, month={Sep}, pages={874–896} } @article{agrawal_menzies_2018, title={Is "Better Data" Better Than "Better Data Miners"? On the Benefits of Tuning SMOTE for Defect Prediction}, DOI={10.1145/3180155.3180197}, abstractNote={We report and fix an important systematic error in prior studies that ranked classifiers for software analytics. Those studies did not (a) assess classifiers on multiple criteria and they did not (b) study how variations in the data affect the results. Hence, this paper applies (a) multi-performance criteria while (b) fixing the weaker regions of the training data (using SMOTUNED, which is an auto-tuning version of SMOTE). This approach leads to dramatically large increases in software defect predictions when applied in a 5*5 cross-validation study for 3,681 JAVA classes (containing over a million lines of code) from open source systems, SMOTUNED increased AUC and recall by 60% and 20% respectively. These improvements are independent of the classifier used to predict for defects. Same kind of pattern (improvement) was observed when a comparative analysis of SMOTE and SMOTUNED was done against the most recent class imbalance technique. In conclusion, for software analytic tasks like defect prediction, (1) data pre-processing can be more important than classifier choice, (2) ranking studies are incomplete without such pre-processing, and (3) SMOTUNED is a promising candidate for pre-processing.}, journal={PROCEEDINGS 2018 IEEE/ACM 40TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING (ICSE)}, author={Agrawal, Amritanshu and Menzies, Tim}, year={2018}, pages={1050–1061} } @article{hsu_nair_menzies_freeh_2018, title={Micky: A Cheaper Alternative for Selecting Cloud Instances}, DOI={10.1109/CLOUD.2018.00058}, abstractNote={Most cloud computing optimizers explore and improve one workload at a time. When optimizing many workloads, the single-optimizer approach can be prohibitively expensive. Accordingly, we examine "collective optimizer" that concurrently explore and improve a set of workloads significantly reducing the measurement costs. Our large-scale empirical study shows that there is often a single cloud configuration which is surprisingly near-optimal for most workloads. Consequently, we create a collective-optimizer, MICKY, that reformulates the task of finding the near-optimal cloud configuration as a multi-armed bandit problem. MICKY efficiently balances exploration (of new cloud configurations) and exploitation (of known good cloud configuration). Our experiments show that MICKY can achieve on average 8.6 times reduction in measurement cost as compared to the state-of-the-art method while finding near-optimal solutions. 
Hence we propose MICKY as the basis of a practical collective optimization method for finding good cloud configurations (based on various constraints such as budget and tolerance to near-optimal configurations).}, journal={PROCEEDINGS 2018 IEEE 11TH INTERNATIONAL CONFERENCE ON CLOUD COMPUTING (CLOUD)}, author={Hsu, Chin-Jung and Nair, Vivek and Menzies, Tim and Freeh, Vincent}, year={2018}, pages={409–416} } @article{chen_menzies_2018, title={RIOT: a Stochastic-based Method for Workflow Scheduling in the Cloud}, DOI={10.1109/CLOUD.2018.00047}, abstractNote={Cloud computing provides engineers or scientists a place to run complex computing tasks. Finding a workflow’s deployment configuration in a cloud environment is not easy. Traditional workflow scheduling algorithms were based on some heuristics, e.g. reliability greedy, cost greedy, cost-time balancing, etc., or more recently, the meta-heuristic methods, such as genetic algorithms. These methods are very slow and not suitable for rescheduling in a dynamic cloud environment. This paper introduces RIOT (Randomized Instance Order Types), a stochastic-based method for workflow scheduling. RIOT groups the tasks in the workflow into virtual machines via a probability model and then uses an effective surrogate-based method to assess a large number of potential schedules. Experiments in dozens of case studies showed that RIOT executes tens of times faster than traditional methods while generating comparable results to other methods.}, journal={PROCEEDINGS 2018 IEEE 11TH INTERNATIONAL CONFERENCE ON CLOUD COMPUTING (CLOUD)}, author={Chen, Jianfeng and Menzies, Tim}, year={2018}, pages={318–325} } @article{menzies_2018, title={The Unreasonable Effectiveness of Software Analytics}, volume={35}, ISSN={["1937-4194"]}, DOI={10.1109/ms.2018.1661323}, abstractNote={In theory, software analytics shouldn’t work because software project behavior shouldn’t be predictable. However, it does. Why?}, number={2}, journal={IEEE SOFTWARE}, author={Menzies, Tim}, year={2018}, pages={96–98} } @article{yu_menzies_2018, title={Total Recall, Language Processing, and Software Engineering}, DOI={10.1145/3283812.3283818}, abstractNote={A broad class of software engineering problems can be generalized as the "total recall problem". This short paper claims that identifying and exploring the total recall problems in software engineering is an important task with wide applicability. To make that case, we show that by applying and adapting the state of the art active learning and natural language processing algorithms for solving the total recall problem, two important software engineering tasks can also be addressed: (a) supporting large literature reviews and (b) identifying software security vulnerabilities. Furthermore, we conjecture that (c) test case prioritization and (d) static warning identification can also be generalized as, and benefit from, the total recall problem. 
The widespread applicability of "total recall" to software engineering suggests that there exists some underlying framework that encompasses not just natural language processing, but a wide range of important software engineering tasks.}, journal={PROCEEDINGS OF THE 4TH ACM SIGSOFT INTERNATIONAL WORKSHOP ON NLP FOR SOFTWARE ENGINEERING (NL4SE '18)}, author={Yu, Zhe and Menzies, Tim}, year={2018}, pages={10–13} } @article{prikladnicki_menzies_2018, title={VOICE OF EVIDENCE From Voice of Evidence to Redirections}, volume={35}, ISSN={["1937-4194"]}, DOI={10.1109/ms.2017.4541053}, abstractNote={The Voice of Experience department is being relaunched as Redirections, which will focus on the surprises in software engineering.}, number={1}, journal={IEEE SOFTWARE}, author={Prikladnicki, Rafael and Menzies, Tim}, year={2018}, pages={11–13} } @article{agrawal_fu_menzies_2018, title={What is wrong with topic modeling? And how to fix it using search-based software engineering}, volume={98}, ISSN={["1873-6025"]}, url={https://doi.org/10.1016/j.infsof.2018.02.005}, DOI={10.1016/j.infsof.2018.02.005}, abstractNote={Topic modeling finds human-readable structures in unstructured textual data. A widely used topic modeling technique is Latent Dirichlet allocation. When running on different datasets, LDA suffers from “order effects”, i.e., different topics are generated if the order of training data is shuffled. Such order effects introduce a systematic error for any study. This error can relate to misleading results; specifically, inaccurate topic descriptions and a reduction in the efficacy of text mining classification results. To provide a method in which distributions generated by LDA are more stable and can be used for further analysis. We use LDADE, a search-based software engineering tool which uses Differential Evolution (DE) to tune the LDA’s parameters. LDADE is evaluated on data from a programmer information exchange site (Stackoverflow), title and abstract text of thousands of Software Engineering (SE) papers, and software defect reports from NASA. Results were collected across different implementations of LDA (Python+Scikit-Learn, Scala+Spark) across Linux platform and for different kinds of LDAs (VEM, Gibbs sampling). Results were scored via topic stability and text mining classification accuracy. In all treatments: (i) standard LDA exhibits very large topic instability; (ii) LDADE’s tunings dramatically reduce cluster instability; (iii) LDADE also leads to improved performances for supervised as well as unsupervised learning. Due to topic instability, using standard LDA with its “off-the-shelf” settings should now be depreciated. Also, in future, we should require SE papers that use LDA to test and (if needed) mitigate LDA topic instability. Finally, LDADE is a candidate technology for effectively and efficiently reducing that instability.}, journal={INFORMATION AND SOFTWARE TECHNOLOGY}, publisher={Elsevier BV}, author={Agrawal, Amritanshu and Fu, Wei and Menzies, Tim}, year={2018}, month={Jun}, pages={74–88} } @article{kessentini_menzies_2017, title={A guest editorial: special issue on search based software engineering and data mining}, volume={24}, ISSN={["1573-7535"]}, DOI={10.1007/s10515-017-0217-2}, number={3}, journal={AUTOMATED SOFTWARE ENGINEERING}, author={Kessentini, Marouane and Menzies, Tim}, year={2017}, month={Sep}, pages={573–574} } @article{menzies_nichols_shull_layman_2017, title={Are delayed issues harder to resolve? 
Revisiting cost-to-fix of defects throughout the lifecycle}, volume={22}, ISSN={["1573-7616"]}, url={https://doi.org/10.1007/s10664-016-9469-x}, DOI={10.1007/s10664-016-9469-x}, abstractNote={Many practitioners and academics believe in a delayed issue effect (DIE); i.e. the longer an issue lingers in the system, the more effort it requires to resolve. This belief is often used to justify major investments in new development processes that promise to retire more issues sooner. This paper tests for the delayed issue effect in 171 software projects conducted around the world in the period from 2006–2014. To the best of our knowledge, this is the largest study yet published on this effect. We found no evidence for the delayed issue effect; i.e. the effort to resolve issues in a later phase was not consistently or substantially greater than when issues were resolved soon after their introduction. This paper documents the above study and explores reasons for this mismatch between this common rule of thumb and empirical data. In summary, DIE is not some constant across all projects. Rather, DIE might be an historical relic that occurs intermittently only in certain kinds of projects. This is a significant result since it predicts that new development processes that promise to faster retire more issues will not have a guaranteed return on investment (depending on the context where applied), and that a long-held truth in software engineering should not be considered a global truism.}, number={4}, journal={EMPIRICAL SOFTWARE ENGINEERING}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Nichols, William and Shull, Forrest and Layman, Lucas}, year={2017}, month={Aug}, pages={1903–1935} } @article{krishna_menzies_layman_2017, title={Less is more: Minimizing code reorganization using XTREE}, volume={88}, ISSN={["1873-6025"]}, DOI={10.1016/j.infsof.2017.03.012}, abstractNote={Context: Developers use bad code smells to guide code reorganization. Yet developers, text books, tools, and researchers disagree on which bad smells are important. Objective: To evaluate the likelihood that a code reorganization to address bad code smells will yield improvement in the defect-proneness of the code. Method: We introduce XTREE, a tool that analyzes a historical log of defects seen previously in the code and generates a set of useful code changes. Any bad smell that requires changes outside of that set can be deprioritized (since there is no historical evidence that the bad smell causes any problems). Evaluation: We evaluate XTREE's recommendations for bad smell improvement against recommendations from previous work (Shatnawi, Alves, and Borges) using multiple data sets of code metrics and defect counts. Results: Code modules that are changed in response to XTREE's recommendations contain significantly fewer defects than recommendations from previous studies. Further, XTREE endorses changes to very few code metrics, and the bad smell recommendations (learned from previous studies) are not universal to all software projects. Conclusion: Before undertaking a code reorganization based on a bad smell report, use a tool like XTREE to check and ignore any such operations that are useless; i.e. ones which lack evidence in the historical record that it is useful to make that change. Note that this use case applies to both manual code reorganizations proposed by developers as well as those conducted by automatic methods. This recommendation assumes that there is an historical record. 
If none exists, then the results of this paper could be used as a guide.}, journal={INFORMATION AND SOFTWARE TECHNOLOGY}, author={Krishna, Rahul and Menzies, Tim and Layman, Lucas}, year={2017}, month={Aug}, pages={53–66} } @article{menzies_yang_mathew_boehm_hihn_2017, title={Negative results for software effort estimation}, volume={22}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-016-9472-2}, abstractNote={More than half the literature on software effort estimation (SEE) focuses on comparisons of new estimation methods. Surprisingly, there are no studies comparing state of the art latest methods with decades-old approaches. Accordingly, this paper takes five steps to check if new SEE methods generated better estimates than older methods. Firstly, collect effort estimation methods ranging from “classical” COCOMO (parametric estimation over a pre-determined set of attributes) to “modern” (reasoning via analogy using spectral-based clustering plus instance and feature selection, and a recent “baseline method” proposed in ACM Transactions on Software Engineering). Secondly, catalog the list of objections that lead to the development of post-COCOMO estimation methods. Thirdly, characterize each of those objections as a comparison between newer and older estimation methods. Fourthly, using four COCOMO-style data sets (from 1991, 2000, 2005, 2010) and run those comparisons experiments. Fifthly, compare the performance of the different estimators using a Scott-Knott procedure using (i) the A12 effect size to rule out “small” differences and (ii) a 99 % confident bootstrap procedure to check for statistically different groupings of treatments. The major negative result of this paper is that for the COCOMO data sets, nothing we studied did any better than Boehms original procedure. Hence, we conclude that when COCOMO-style attributes are available, we strongly recommend (i) using that data and (ii) use COCOMO to generate predictions. We say this since the experiments of this paper show that, at least for effort estimation, how data is collected is more important than what learner is applied to that data.}, number={5}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Menzies, Tim and Yang, Ye and Mathew, George and Boehm, Barry and Hihn, Jairus}, year={2017}, month={Oct}, pages={2658–2683} } @article{pandita_jetley_sudarsan_menzies_williams_2017, title={TMAP: Discovering relevant API methods through text mining of API documentation}, volume={29}, ISSN={2047-7473}, url={http://dx.doi.org/10.1002/SMR.1845}, DOI={10.1002/SMR.1845}, abstractNote={Abstract}, number={12}, journal={Journal of Software: Evolution and Process}, publisher={Wiley}, author={Pandita, Rahul and Jetley, Raoul and Sudarsan, Sithu and Menzies, Timothy and Williams, Laurie}, year={2017}, month={Feb}, pages={e1845} } @inproceedings{hihn_saing_huntington_johnson_menzies_mathew_2017, title={The NASA analogy software cost model: A web-based cost analysis tool}, DOI={10.1109/aero.2017.7943730}, abstractNote={This paper provides an overview of the many new features and algorithm updates in the release of the NASA Analogy Software Cost Tool (ASCoT). ASCoT is a web-based tool that provides a suite of estimation tools to support early lifecycle NASA Flight Software analysis. 
ASCoT employs advanced statistical methods such as Cluster Analysis to provide an analogy based estimate of software delivered lines of code and development effort, a regression based Cost Estimating Relationships (CER) model that estimates cost (dollars), and a COCOMO II based estimate. The ASCoT algorithms are designed to primarily work with system level inputs such as mission type (earth orbiter vs. planetary vs. rover), the number of instruments, and total mission cost. This allows the user to supply a minimal number of mission-level parameters which are better understood early in the life-cycle, rather than a large number of complex inputs.}, booktitle={2017 ieee aerospace conference}, author={Hihn, J. and Saing, M. and Huntington, E. and Johnson, J. and Menzies, Tim and Mathew, G.}, year={2017} } @article{nair_menzies_siegmund_apel_2017, title={Using Bad Learners to Find Good Configurations}, DOI={10.1145/3106237.3106238}, abstractNote={Finding the optimally performing configuration of a software system for a given setting is often challenging. Recent approaches address this challenge by learning performance models based on a sample set of configurations. However, building an accurate performance model can be very expensive (and is often infeasible in practice). The central insight of this paper is that exact performance values (e.g., the response time of a software system) are not required to rank configurations and to identify the optimal one. As shown by our experiments, performance models that are cheap to learn but inaccurate (with respect to the difference between actual and predicted performance) can still be used rank configurations and hence find the optimal configuration. This novel rank-based approach allows us to significantly reduce the cost (in terms of number of measurements of sample configuration) as well as the time required to build performance models. We evaluate our approach with 21 scenarios based on 9 software systems and demonstrate that our approach is beneficial in 16 scenarios; for the remaining 5 scenarios, an accurate model can be built by using very few samples anyway, without the need for a rank-based approach.}, journal={ESEC/FSE 2017: PROCEEDINGS OF THE 2017 11TH JOINT MEETING ON FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Nair, Vivek and Menzies, Tim and Siegmund, Norbert and Apel, Sven}, year={2017}, pages={257–267} } @article{menzies_2016, title={"How not to Do it": Anti-patterns for Data Science in Software Engineering}, DOI={10.1145/2889160.2891047}, abstractNote={Many books and papers describe how to do data science. While those texts are useful, it can also be important to reflect on anti-patterns; i.e. common classes of errors seen when large communities of researchers and commercial software engineers use, and misuse data mining tools. This technical briefing will present those errors and show how to avoid them.}, journal={2016 IEEE/ACM 38TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING COMPANION (ICSE-C)}, author={Menzies, Tim}, year={2016}, pages={887–887} } @article{nair_menzies_chen_2016, title={An (Accidental) Exploration of Alternatives to Evolutionary Algorithms for SBSE}, volume={9962}, ISBN={["978-3-319-47105-1"]}, ISSN={["0302-9743"]}, DOI={10.1007/978-3-319-47106-8_7}, abstractNote={SBSE researchers often use an evolutionary algorithm to solve various software engineering problems. This paper explores an alternate approach of sampling. 
This approach is called SWAY (Sampling WAY) and finds the (near) optimal solutions to the problem by (i) creating a larger initial population and (ii) intelligently sampling the solution space to find the best subspace. Unlike evolutionary algorithms, SWAY does not use mutation or cross-over or multi-generational reasoning to find interesting subspaces but relies on the underlying dimensions of the solution space. Experiments with Software Engineering (SE) models show that SWAY’s performance improvement is competitive with standard MOEAs while terminating over an order of magnitude faster.}, journal={SEARCH BASED SOFTWARE ENGINEERING, SSBSE 2016}, author={Nair, Vivek and Menzies, Tim and Chen, Jianfeng}, year={2016}, pages={96–111} } @article{menzies_2016, title={Correlation is not causation (or, when not to scream "Eureka!")}, DOI={10.1016/b978-0-12-804206-9.00059-3}, abstractNote={When we stumble onto some pattern in the data, it is so tempting to send a Eureka! text to the business users. This is a natural response that stems from the excitement of doing science and discovering an effect that no one has ever seen before. Here’s my warning: don’t do it. At least, don’t do it straight away.}, journal={Perspectives on Data Science for Software Engineering}, author={Menzies, Tim}, year={2016}, pages={327–330} } @inproceedings{hihn_juster_johnson_menzies_michael_2016, title={Improving and expanding NASA software cost estimation methods}, DOI={10.1109/aero.2016.7500655}, abstractNote={Estimators and analysts are increasingly being tasked to develop better models and reliable cost estimates in support of program planning and execution. While there has been extensive work on improving parametric methods for cost estimation, there is very little focus on the use of cost models based on analogy and clustering algorithms. In this paper we summarize the results of our research in developing an analogy method for estimating NASA spacecraft flight software using spectral clustering on system characteristics (symbolic non-numerical data) and evaluate its performance by comparing it to a number of the most commonly used estimation methods. The strengths and weaknesses of each method based on their performance are also discussed. The paper concludes with an overview of the analogy estimation tool (ASCoT) developed for use within NASA that implements the recommended analogy algorithm.}, booktitle={2016 ieee aerospace conference}, author={Hihn, J. and Juster, L. and Johnson, J. and Menzies, Tim and Michael, G.}, year={2016} } @article{krall_menzies_davies_2016, title={Learning Mitigations for Pilot Issues When Landing Aircraft (via Multiobjective Optimization and Multiagent Simulations)}, volume={46}, ISSN={["2168-2305"]}, DOI={10.1109/thms.2015.2509980}, abstractNote={We advocate exploring complex models by combining data miners (to find a small set of the most critical examples) and multiobjective optimizers (that focus on those critical examples). An example of such a combination is the GALE optimizer that intelligently explores thousands of scenarios by examining just a few dozen of the most informative examples. GALE-style reasoning enables a very fast, very wide ranging exploration of behaviors, as well as the effects of those behaviors' limitations. This paper applies GALE to the continuous descent approach (CDA) model within the Georgia Tech Work Models that Compute framework. 
CDA is a model of pilot interactions: with each other and also with the navigation systems critical to safe flight. We show that, using CDA+GALE, it is possible to identify and mitigate factors that make pilots unable to complete all their required tasks in the context of different 1) function allocation strategies, 2) pilot cognitive control strategies, and 3) operational contexts that impact safe aircraft operation. We also show that other optimization methods can be so slow to run that, without GALE, it might be impractical to find those mitigations.}, number={2}, journal={IEEE TRANSACTIONS ON HUMAN-MACHINE SYSTEMS}, author={Krall, Joseph and Menzies, Tim and Davies, Misty}, year={2016}, month={Apr}, pages={221–230} } @article{menzies_williams_zimmermann_2016, title={Perspectives on data science for software engineering}, DOI={10.1016/b978-0-12-804206-9.00001-5}, abstractNote={Given recent increases in how much data we can collect, and given a shortage of skilled analysts that can assess that data, there now exists more data than people to study it. Consequently, the analysis of real-world data is an exploding field, to say the least. For software projects, a lot of information is recorded in software repositories. Never before have we had so much information about the details on how people collaborate to build software.}, journal={Perspectives on Data Science for Software Engineering}, author={Menzies, Tim and Williams, L. and Zimmermann, T.}, year={2016}, pages={3–6} } @article{menzies_2016, title={Seven principles of inductive software engineering: What we do is different}, DOI={10.1016/b978-0-12-804206-9.00003-9}, abstractNote={Inductive software engineering is the branch of software engineering focusing on the delivery of data-mining based software applications. Within those data mines, the core problem is induction, which is the extraction of small patterns from larger data sets. Inductive engineers spend much effort trying to understand business goals in order to inductively generate the models that matter the most.}, journal={Perspectives on Data Science for Software Engineering}, author={Menzies, Tim}, year={2016}, pages={13–17} } @article{krishna_menzies_fu_2016, title={Too Much Automation? The Bellwether Effect and Its Implications for Transfer Learning}, ISSN={["1527-1366"]}, DOI={10.1145/2970276.2970339}, abstractNote={“Transfer learning” is the process of translating quality predictors learned in one data set to another. Transfer learning has been the subject of much recent research. In practice, that research means changing models all the time as transfer learners continually exchange new models to the current project. This paper offers a very simple “bellwether” transfer learner. Given N data sets, we find which one produces the best predictions on all the others. This “bellwether” data set is then used for all subsequent predictions (or, until such time as its predictions start failing, at which point it is wise to seek another bellwether). Bellwethers are interesting since they are very simple to find (just wrap a for-loop around standard data miners). Also, they simplify the task of making general policies in SE since as long as one bellwether remains useful, stable conclusions for N data sets can be achieved just by reasoning over that bellwether. 
From this, we conclude (1) this bellwether method is a useful (and very simple) transfer learning method; (2) “bellwethers” are a baseline method against which future transfer learners should be compared; (3) sometimes, when building increasingly complex automatic methods, researchers should pause and compare their supposedly more sophisticated method against simpler alternatives.}, journal={2016 31ST IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING (ASE)}, author={Krishna, Rahul and Menzies, Tim and Fu, Wei}, year={2016}, pages={122–131} } @article{layman_nikora_meek_menzies_2016, title={Topic Modeling of NASA Space System Problem Reports}, DOI={10.1145/2901739.2901760}, abstractNote={Problem reports at NASA are similar to bug reports: they capture defects found during test, post-launch operational anomalies, and document the investigation and corrective action of the issue. These artifacts are a rich source of lessons learned for NASA, but are expensive to analyze since problem reports are comprised primarily of natural language text. We apply topic modeling to a corpus of NASA problem reports to extract trends in testing and operational failures. We collected 16,669 problem reports from six NASA space flight missions and applied Latent Dirichlet Allocation topic modeling to the document corpus. We analyze the most popular topics within and across missions, and how popular topics changed over the lifetime of a mission. We find that hardware material and flight software issues are common during the integration and testing phase, while ground station software and equipment issues are more common during the operations phase. We identify a number of challenges in topic modeling for trend analysis: 1) the process of selecting the topic modeling parameters lacks definitive guidance, 2) defining semantically-meaningful topic labels requires non-trivial effort and domain expertise, 3) topic models derived from the combined corpus of the six missions were biased toward the larger missions, and 4) topics must be semantically distinct as well as cohesive to be useful. Nonetheless, topic modeling can identify problem themes within missions and across mission lifetimes, providing useful feedback to engineers and project managers.}, journal={13TH WORKING CONFERENCE ON MINING SOFTWARE REPOSITORIES (MSR 2016)}, author={Layman, Lucas and Nikora, Allen P. and Meek, Joshua and Menzies, Tim}, year={2016}, pages={303–314} } @article{fu_menzies_shen_2016, title={Tuning for software analytics: Is it really necessary?}, volume={76}, ISSN={0950-5849}, url={http://dx.doi.org/10.1016/j.infsof.2016.04.017}, DOI={10.1016/j.infsof.2016.04.017}, abstractNote={Context: Data miners have been widely used in software engineering to, say, generate defect predictors from static code measures. Such static code defect predictors perform well compared to manual methods, and they are easy to use and useful. But one of the “black arts” of data mining is setting the tunings that control the miner. Objective: We seek a simple, automatic, and very effective method for finding those tunings. Method: For each experiment with different data sets (from open source JAVA systems), we ran differential evolution as an optimizer to explore the tuning space (as a first step) and then tested the tunings using hold-out data. Results: Contrary to our prior expectations, we found these tunings were remarkably simple: it only required tens, not thousands, of attempts to obtain very good results. 
For example, when learning software defect predictors, this method can quickly find tunings that alter detection precision from 0% to 60%. Conclusion: Since (1) the improvements are so large, and (2) the tuning is so simple, we need to change standard methods in software analytics. At least for defect prediction, it is no longer enough to just run a data miner and present the result without conducting a tuning optimization study. The implication for other kinds of analytics is now an open and pressing issue.}, journal={Information and Software Technology}, publisher={Elsevier BV}, author={Fu, Wei and Menzies, Tim and Shen, Xipeng}, year={2016}, month={Aug}, pages={135–146} } @article{baresi_menzies_metzger_zimmermann_2015, title={1st International Workshop on Big Data Software Engineering (BIGDSE 2015)}, DOI={10.1109/icse.2015.308}, abstractNote={Big Data is about extracting valuable information from data in order to use it in intelligent ways such as to revolutionize decision-making in businesses, science and society. BIGDSE 2015 discusses the link between Big Data and software engineering and critically looks into issues such as the cost-benefit of big data.}, journal={2015 IEEE/ACM 37th IEEE International Conference on Software Engineering, Vol 2}, author={Baresi, Luciano and Menzies, Tim and Metzger, Andreas and Zimmermann, Thomas}, year={2015}, pages={965–966} } @article{krishna_menzies_2015, title={Actionable = Cluster plus Contrast?}, DOI={10.1109/asew.2015.23}, abstractNote={There are many algorithms for data classification such as C4.5, Naive Bayes, etc. Are these enough for learning actionable analytics? Or should we be supporting another kind of reasoning? This paper explores two approaches for learning minimal, yet effective, changes to software project artifacts.}, journal={2015 30TH IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING WORKSHOP (ASEW)}, author={Krishna, Rahul and Menzies, Tim}, year={2015}, pages={14–17} } @article{menzies_2015, title={Cross-Project Data for Software Engineering}, volume={48}, ISSN={["1558-0814"]}, DOI={10.1109/mc.2015.381}, abstractNote={This installment of Computer's series highlighting the work published in IEEE Computer Society journals comes from IEEE Transactions on Software Engineering.}, number={12}, journal={COMPUTER}, author={Menzies, Tim}, year={2015}, month={Dec}, pages={6–6} } @article{hihn_menzies_2015, title={Data Mining Methods and Cost Estimation Models: Why is it so hard to infuse new ideas?}, DOI={10.1109/asew.2015.27}, abstractNote={Infusing new technologies and methods is hard and can often be described as "banging one's head on a brick wall". This is especially true when trying to get project managers, systems engineers, and cost analysts to add a radically new tool to their tool box. In this paper we suggest that the underlying causes are rooted in the fact that the different players have fundamental differences in mental models, vocabulary and objectives. We based this work on lessons learned from ten years of working on the infusion of software costing models into NASA. 
The good news is that, lately, a crack has begun to appear in what was previously a brick wall.}, journal={2015 30TH IEEE/ACM INTERNATIONAL CONFERENCE ON AUTOMATED SOFTWARE ENGINEERING WORKSHOP (ASEW)}, author={Hihn, Jairus and Menzies, Tim}, year={2015}, pages={5–9} } @article{krall_menzies_davies_2015, title={GALE: Geometric Active Learning for Search-Based Software Engineering}, volume={41}, ISSN={["1939-3520"]}, DOI={10.1109/tse.2015.2432024}, abstractNote={Multi-objective evolutionary algorithms (MOEAs) help software engineers find novel solutions to complex problems. When automatic tools explore too many options, they are slow to use and hard to comprehend. GALE is a near-linear time MOEA that builds a piecewise approximation to the surface of best solutions along the Pareto frontier. For each piece, GALE mutates solutions towards the better end. In numerous case studies, GALE finds comparable solutions to standard methods (NSGA-II, SPEA2) using far fewer evaluations (e.g. 20 evaluations, not 1,000). GALE is recommended when a model is expensive to evaluate, or when some audience needs to browse and understand how an MOEA has made its conclusions.}, number={10}, journal={IEEE TRANSACTIONS ON SOFTWARE ENGINEERING}, author={Krall, Joseph and Menzies, Tim and Davies, Misty}, year={2015}, month={Oct}, pages={1001–1018} } @article{harrison_menzies_2015, title={Guest editorial: special issue on realizing AI synergies in software engineering}, volume={22}, ISSN={["1573-7535"]}, DOI={10.1007/s10515-014-0174-y}, number={1}, journal={AUTOMATED SOFTWARE ENGINEERING}, author={Harrison, Rachel and Menzies, Tim}, year={2015}, month={Mar}, pages={1–2} } @article{harrison_menzies_2015, title={Guest editorial: special issue on realizing AI synergies in software engineering (part 2)}, volume={22}, DOI={10.1007/s10515-014-0177-8}, number={2}, journal={Automated Software Engineering}, author={Harrison, R. and Menzies, Tim}, year={2015}, pages={143–144} } @article{menzies_pasareanu_2015, title={Guest editorial: special multi-issue on selected topics in Automated Software Engineering}, volume={22}, ISSN={["1573-7535"]}, DOI={10.1007/s10515-015-0180-8}, number={3}, journal={AUTOMATED SOFTWARE ENGINEERING}, author={Menzies, Tim and Pasareanu, Corina}, year={2015}, month={Sep}, pages={289–290} } @article{menzies_pasareanu_2015, title={Guest editorial: special multi-issue on selected topics in automated software engineering}, volume={22}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/S10515-015-0181-7}, DOI={10.1007/S10515-015-0181-7}, number={4}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Pasareanu, Corina}, year={2015}, month={Jul}, pages={437–438} } @inproceedings{peters_menzies_layman_2015, title={LACE2: Better privacy-preserving data sharing for cross project defect prediction}, DOI={10.1109/icse.2015.92}, abstractNote={Before a community can learn general principles, it must share individual experiences. Data sharing is the fundamental step of cross project defect prediction, i.e. the process of using data from one project to predict for defects in another. Prior work on secure data sharing allowed data owners to share their data on a single-party basis for defect prediction via data minimization and obfuscation. However the studied method did not consider that bigger data required the data owner to share more of their data. 
In this paper, we extend previous work with LACE2 which reduces the amount of data shared by using multi-party data sharing. Here data owners incrementally add data to a cache passed among them and contribute "interesting" data that are not similar to the current content of the cache. Also, before data owner i passes the cache to data owner j, privacy is preserved by applying obfuscation algorithms to hide project details. The experiments of this paper show that (a) LACE2 is comparatively less expensive than the single-party approach and (b) the multi-party approach of LACE2 yields higher privacy than the prior approach without damaging predictive efficacy (indeed, in some cases, LACE2 leads to better defect predictors).}, booktitle={2015 IEEE/ACM 37th IEEE International Conference on Software Engineering, Vol 1}, author={Peters, F. and Menzies, Tim and Layman, L.}, year={2015}, pages={801–811} } @article{partington_menzies_colburn_saelens_glanz_2015, title={Reduced-Item Food Audits Based on the Nutrition Environment Measures Surveys}, volume={49}, ISSN={0749-3797}, url={http://dx.doi.org/10.1016/J.AMEPRE.2015.04.036}, DOI={10.1016/J.AMEPRE.2015.04.036}, abstractNote={The community food environment may contribute to obesity by influencing food choice. Store and restaurant audits are increasingly common methods for assessing food environments, but are time consuming and costly. A valid, reliable brief measurement tool is needed. The purpose of this study was to develop and validate reduced-item food environment audit tools for stores and restaurants.Nutrition Environment Measures Surveys for stores (NEMS-S) and restaurants (NEMS-R) were completed in 820 stores and 1,795 restaurants in West Virginia, San Diego, and Seattle. Data mining techniques (correlation-based feature selection and linear regression) were used to identify survey items highly correlated to total survey scores and produce reduced-item audit tools that were subsequently validated against full NEMS surveys. Regression coefficients were used as weights that were applied to reduced-item tool items to generate comparable scores to full NEMS surveys. Data were collected and analyzed in 2008-2013.The reduced-item tools included eight items for grocery, ten for convenience, seven for variety, and five for other stores; and 16 items for sit-down, 14 for fast casual, 19 for fast food, and 13 for specialty restaurants-10% of the full NEMS-S and 25% of the full NEMS-R. There were no significant differences in median scores for varying types of retail food outlets when compared to the full survey scores. Median in-store audit time was reduced 25%-50%.Reduced-item audit tools can reduce the burden and complexity of large-scale or repeated assessments of the retail food environment without compromising measurement quality.}, number={4}, journal={American Journal of Preventive Medicine}, publisher={Elsevier BV}, author={Partington, Susan N. and Menzies, Tim J. and Colburn, Trina A. and Saelens, Brian E. and Glanz, Karen}, year={2015}, month={Oct}, pages={e23–e33} } @article{menzies_minku_peters_2015, title={The Art and Science of Analyzing Software Data; Quantitative Methods}, DOI={10.1109/icse.2015.306}, abstractNote={Using the tools of quantitative data science, software engineers that can predict useful information on new projects based on past projects. This tutorial reflects on the state-of-the-art in quantitative reasoning in this important field. 
This tutorial discusses the following: (a) when local data is scarce, we show how to adapt data from other organizations to local problems; (b) when working with data of dubious quality, we show how to prune spurious information; (c) when data or models seem too complex, we show how to simplify data mining results; (d) when the world changes, and old models need to be updated, we show how to handle those updates; (e) when the effect is too complex for one model, we show how to reason over ensembles.}, journal={2015 IEEE/ACM 37th IEEE International Conference on Software Engineering, Vol 2}, author={Menzies, Tim and Minku, Leandro and Peters, Fayola}, year={2015}, pages={959–960} } @article{kocaguneli_menzies_mendes_2015, title={Transfer learning in effort estimation}, volume={20}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/S10664-014-9300-5}, DOI={10.1007/S10664-014-9300-5}, number={3}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Kocaguneli, Ekrem and Menzies, Tim and Mendes, Emilia}, year={2015}, month={Jun}, pages={813–843} } @article{partington_murphy_bowen_lacombe_piras_cottrell_menzies_2014, title={Choose to Change: The West Virginia Early Childhood Obesity Prevention Project}, volume={46}, ISSN={1499-4046}, url={http://dx.doi.org/10.1016/J.JNEB.2014.04.213}, DOI={10.1016/J.JNEB.2014.04.213}, abstractNote={Choose to Change is a five-year study designed to assess contributors to early childhood obesity and to design, implement, and evaluate home-, school-, and community-level obesity prevention initiatives. Components: multi-level assessment of behavioral and environmental contributors to childhood obesity, implementation of community-, school-, and home-level interventions, and assessment of intervention efficacy. Study population: 286 children in HeadStart/pre-kindergarten in two West Virginia counties. Change in physical activity and eating behavior in children and families from pre- to post-intervention. Characteristics of home and neighborhood environments may be important influences on eating and physical activity behaviors in families of very young children.}, number={4}, journal={Journal of Nutrition Education and Behavior}, publisher={Elsevier BV}, author={Partington, Susan and Murphy, E. and Bowen, E. and Lacombe, D. and Piras, G. and Cottrell, L. and Menzies, T.}, year={2014}, month={Jul}, pages={S197} } @article{menzies_mernik_2014, title={Special issue on realizing artificial intelligence synergies in software engineering}, volume={22}, ISSN={0963-9314 1573-1367}, url={http://dx.doi.org/10.1007/S11219-014-9228-4}, DOI={10.1007/S11219-014-9228-4}, abstractNote={This special issue focuses on issues arising from the RAISE’12 Workshop on Realizing Artificial Intelligence Synergies in Software Engineering. Our objective is to provide a forum for researchers and industrial practitioners to exchange and discuss the latest innovative synergistic AI and SE techniques/practices. Why explore this combination of SE and AI? We think there are many answers to that question. As SE is asked to answer dynamic, automated, adaptive, and/or large-scale demands, other computer science disciplines come into play. AI is one of them that may bring SE to further heights. Conversely, SE can also play a role in alleviating the development costs and the development effort associated with AI tools. Such mutually beneficial characteristics have appeared in the past few decades and still evolve due to new challenges. 
That is, this special issue explores not only the application of AI techniques to software engineering problems but also the application of software engineering techniques to AI problems. This special issue is the result of much work that is still ongoing. As to future work, the RAISE series is also on-going. At the time of this writing, the RAISE’13 workshop has just completed (that event was sponsored by the United States National Science Foundation, and we thank them for their generous support). At that meeting, we saw much continued interest in this union of AI and SE. The reader should expect much novel and exciting work from this combination of ideas, in the very near future. And regarding the work seen to date, we hope that all the aforementioned papers will provide readers with some glimpse of the kind of work discussed at RAISE’12. Also, we would like to sincerely thank the reviewers for their assistance in the reviewing process.}, number={1}, journal={Software Quality Journal}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Mernik, Marjan}, year={2014}, month={Feb}, pages={49–50} } @article{partington_murphy_bowen_lacombe_piras_carson_cottrell_menzies_2013, title={Choose to Change: The West Virginia Early Childhood Obesity Prevention Project}, volume={45}, ISSN={1499-4046}, url={http://dx.doi.org/10.1016/J.JNEB.2013.04.271}, DOI={10.1016/J.JNEB.2013.04.271}, abstractNote={Develop and disseminate effective, sustainable, multi-level pediatric obesity prevention strategies. Project Components: 1) multi-level assessment of behavioral and environmental contributors to early childhood obesity, 2) implementation of a community-, school-, and home-level intervention, and 4) assessment of intervention efficacy. Cohort one study population: 151 families with children in HeadStart/pre-kindergarten in two West Virginia counties. Outcomes will be assessed as change in physical activity and eating behavior in children and families from the pre- to post intervention. Pre-intervention assessment indicated 37% of study children were overweight or obese. To be determined based on post-intervention analyses.}, number={4}, journal={Journal of Nutrition Education and Behavior}, publisher={Elsevier BV}, author={Partington, Susan and Murphy, E. and Bowen, E. and Lacombe, D. and Piras, G. and Carson, L. and Cottrell, L. and Menzies, T.}, year={2013}, month={Jul}, pages={S92} } @article{menzies_2013, title={Guest editorial for the Special Section on BEST PAPERS from the 2011 conference on Predictive Models in Software Engineering (PROMISE)}, volume={55}, ISSN={0950-5849}, url={http://dx.doi.org/10.1016/J.INFSOF.2013.03.006}, DOI={10.1016/J.INFSOF.2013.03.006}, abstractNote={The measurement of Function Points is based on Base Functional Components. The process of identifying and weighting Base Functional Components is hardly automatable, due to the informality of both the Function Point method and the requirements documents being measured. So, Function Point measurement generally requires a lengthy and costly process.We investigate whether it is possible to take into account only subsets of Base Functional Components so as to obtain functional size measures that simplify Function Points with the same effort estimation accuracy as the original Function Points measure. Simplifying the definition of Function Points would imply a reduction of measurement costs and may help spread the adoption of this type of measurement practices. 
Specifically, we empirically investigate the following issues: whether available data provide evidence that simplified software functionality measures can be defined in a way that is consistent with Function Point Analysis; whether simplified functional size measures by themselves can be used without any appreciable loss in software development effort prediction accuracy; whether simplified functional size measures can be used as software development effort predictors in models that also use other software requirements measures.We analyze the relationships between Function Points and their Base Functional Components. We also analyze the relationships between Base Functional Components and development effort. Finally, we built effort prediction models that contain both the simplified functional measures and additional requirements measures.Significant statistical models correlate Function Points with Base Functional Components. Basic Functional Components can be used to build models of effort that are equivalent, in terms of accuracy, to those based on Function Points. Finally, simplified Function Points measures can be used as software development effort predictors in models that also use other requirements measures.The definition and measurement processes of Function Points can be dramatically simplified by taking into account a subset of the Base Functional Components used in the original definition of the measure, thus allowing for substantial savings in measurement effort, without sacrificing the accuracy of software development effort estimates.}, number={8}, journal={Information and Software Technology}, publisher={Elsevier BV}, author={Menzies, Tim}, year={2013}, month={Aug}, pages={1477–1478} } @article{menzies_koru_2013, title={Predictive models in software engineering}, volume={18}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/S10664-013-9252-1}, DOI={10.1007/S10664-013-9252-1}, abstractNote={Welcome to the Empirical Software Engineering’s special issue on predictive models in software engineering. The goal of such methods is repeatable, refutable (and possibly improvable) results in software engineering. Many of the recent papers in SE literature are based on data from on-line repositories such as http://promisedata.googlecode.com. This introduces a kind of selection in the kinds of papers published at this venue. Our first paper pushes past that bias to explore a very rich time-based data set. In “Predicting the Flow of Defect Correction Effort using a Bayesian Network Model”, Schulz et al. use a Bayes net to explore the effects of removing defects at different stages of the software lifecycle. Their work shows how to calibrate general models to the particulars of a company’s local particulars. Our next paper “The Limited Impact of Individual Developer Data on Software Defect Prediction” by Bell et concludes there is no added value to reasoning on some aspects of social aspects of programmer teams working on a code. This is a timely counterpoint to other research that eschews code measures for other approaches based only on social metrics. Our last paper explores the complicated issue of parameter tuning. In “Using Tabu Search to Configure Support Vector Regression for Effort Estimation”, Corazza et al. offers automated guidance for setting the parameters that control a learner. This is a matter of critical importance since even the best learner can perform poorly if its operator uses the wrong settings. 
A special issue like this is only possible due to the hard work of a dedicated set of authors and reviewers. We would like to express our gratitude to all authors who submitted their papers to this special issue. We would also like to thank our reviewers for their meticulous evaluation of the submissions. The success of special issues such as this one largely stands on their shoulders.}, number={3}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Koru, Gunes}, year={2013}, month={Apr}, pages={433–434} } @article{kocaguneli_menzies_2013, title={Software effort models should be assessed via leave-one-out validation}, volume={86}, ISSN={0164-1212}, url={http://dx.doi.org/10.1016/J.JSS.2013.02.053}, DOI={10.1016/J.JSS.2013.02.053}, abstractNote={More than half the literature on software effort estimation (SEE) focuses on model comparisons. Each of those requires a sampling method (SM) to generate the train and test sets. Different authors use different SMs such as leave-one-out (LOO), 3Way and 10Way cross-validation. While LOO is a deterministic algorithm, the N-way methods use random selection to build their train and test sets. This introduces the problem of conclusion instability where different authors rank effort estimators in different ways. To reduce conclusion instability by removing the effects of a sampling method's random test case generation. Calculate bias and variance (B&V) values following the assumption that a learner trained on the whole dataset is taken as the true model; then demonstrate that the B&V and runtime values for LOO are similar to N-way by running 90 different algorithms on 20 different SEE datasets. For each algorithm, collect runtimes, B&V values under LOO, 3Way and 10Way. We observed that: (1) the majority of the algorithms have statistically indistinguishable B&V values under different SMs and (2) different SMs have similar run times. In terms of their generated B&V values and runtimes, there is no reason to prefer N-way over LOO. In terms of reproducibility, LOO removes one cause of conclusion instability (the random selection of train and test sets). Therefore, we deprecate N-way and endorse LOO validation for assessing effort models.}, number={7}, journal={Journal of Systems and Software}, publisher={Elsevier BV}, author={Kocaguneli, Ekrem and Menzies, Tim}, year={2013}, month={Jul}, pages={1879–1890} } @inbook{kim_kang_ryu_compton_han_menzies_2012, title={Crowd-Sourced Knowledge Bases}, ISBN={9783642325403 9783642325410}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-32541-0_23}, DOI={10.1007/978-3-642-32541-0_23}, abstractNote={Crowdsourcing is a low-cost way of obtaining human judgements on a large number of items, but the knowledge in these judgements is not reusable and further items to be processed require further human judgement. Ideally one could also obtain the reasons people have for these judgements, so the ability to make the same judgements could be incorporated into a crowd-sourced knowledge base. This paper reports on experiments with 27 students building knowledge bases to classify the same set of 1000 documents. We have assessed the performance of the students building the knowledge bases using the same students to assess the performance of each other’s knowledge bases on a set of test documents. We have explored simple techniques for combining the knowledge from the students. 
These results suggest that although people vary in document classification, simple merging may produce reasonable consensus knowledge bases.}, booktitle={Knowledge Management and Acquisition for Intelligent Systems}, publisher={Springer Berlin Heidelberg}, author={Kim, Yang Sok and Kang, Byeong Ho and Ryu, Seung Hwan and Compton, Paul and Han, Soyeon Caren and Menzies, Tim}, year={2012}, pages={258–271} } @article{keung_kocaguneli_menzies_2012, title={Finding conclusion stability for selecting the best effort predictor in software effort estimation}, volume={20}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/S10515-012-0108-5}, DOI={10.1007/S10515-012-0108-5}, number={4}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Keung, Jacky and Kocaguneli, Ekrem and Menzies, Tim}, year={2012}, month={May}, pages={543–567} } @article{menzies_shepperd_2012, title={Special issue on repeatable results in software engineering prediction}, volume={17}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/S10664-011-9193-5}, DOI={10.1007/S10664-011-9193-5}, abstractNote={The goal of science is conclusion stability, i.e. to discover some effect X that holds in multiple situations. Sadly, there are all too few examples of stable conclusions in software engineering (SE). In fact, the typical result is conclusion instability, where what is true for project one does not hold for project two. We can find numerous studies of the following form: there is as much evidence for as against the argument that some aspect X adds value to a software project. Below are four examples of this type of problem, which we believe to be endemic within SE.}, number={1-2}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Shepperd, Martin}, year={2012}, month={Jan}, pages={1–17} } @article{bener_menzies_2011, title={Guest editorial: learning to organize testing}, volume={19}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/S10515-011-0095-Y}, DOI={10.1007/S10515-011-0095-Y}, abstractNote={At the start of the decade, two publications (Shull et al. 2002; Boehm and Basili 2001) described the state of the art in defect reduction. Since then, there has been considerable research into data mining of defect data; e.g. Menzies et al. (2007). The data mining work has become less about defect reduction, and more about how to organize a project’s test resources in order to improve product quality by (say) defining a procedure such that the modules most likely to contain defects are inspected first (Menzies et al. 2010). After a decade of intensive work on data mining to make the best use of testing resources, it is time to ask: what have we learned from all that research? Some of that research offers success stories with (e.g.): • Reducing the costs to find defects (Menzies et al. 2010); • Generalizing defect predictors to other projects (Tosun et al. 2011); • Tuning those predictors to different business goals (Turhan et al.
2009).}, number={2}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Bener, Ayse and Menzies, Tim}, year={2011}, month={Oct}, pages={137–140} } @article{kocaguneli_menzies_keung_2011, title={Kernel methods for software effort estimation}, volume={18}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/S10664-011-9189-1}, DOI={10.1007/S10664-011-9189-1}, number={1}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Kocaguneli, Ekrem and Menzies, Tim and Keung, Jacky W.}, year={2011}, month={Dec}, pages={1–24} } @article{nandeshwar_menzies_nelson_2011, title={Learning patterns of university student retention}, volume={38}, ISSN={0957-4174}, url={http://dx.doi.org/10.1016/j.eswa.2011.05.048}, DOI={10.1016/j.eswa.2011.05.048}, abstractNote={Learning predictors for student retention is very difficult. After reviewing the literature, it is evident that there is considerable room for improvement in the current state of the art. As shown in this paper, improvements are possible if we (a) explore a wide range of learning methods; (b) take care when selecting attributes; (c) assess the efficacy of the learned theory not just by its median performance, but also by the variance in that performance; (d) study the delta of student factors between those who stay and those who are retained. Using these techniques, for the goal of predicting if students will remain for the first three years of an undergraduate degree, the following factors were found to be informative: family background and family’s socio-economic status, high school GPA and test scores.}, number={12}, journal={Expert Systems with Applications}, publisher={Elsevier BV}, author={Nandeshwar, Ashutosh and Menzies, Tim and Nelson, Adam}, year={2011}, month={Nov}, pages={14984–14996} } @article{el-rawas_menzies_2010, title={A second look at Faster, Better, Cheaper}, volume={6}, ISSN={1614-5046 1614-5054}, url={http://dx.doi.org/10.1007/S11334-010-0137-9}, DOI={10.1007/S11334-010-0137-9}, number={4}, journal={Innovations in Systems and Software Engineering}, publisher={Springer Science and Business Media LLC}, author={El-Rawas, Oussama and Menzies, Tim}, year={2010}, month={Oct}, pages={319–335} } @article{gay_menzies_davies_gundy-burlet_2010, title={Automatically finding the control variables for complex system behavior}, volume={17}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/s10515-010-0072-x}, DOI={10.1007/s10515-010-0072-x}, abstractNote={Testing large-scale systems is expensive in terms of both time and money. Running simulations early in the process is a proven method of finding the design faults likely to lead to critical system failures, but determining the exact cause of those errors is still time-consuming and requires access to a limited number of domain experts. It is desirable to find an automated method that explores the large number of combinations and is able to isolate likely fault points. Treatment learning is a subset of minimal contrast-set learning that, rather than classifying data into distinct categories, focuses on finding the unique factors that lead to a particular classification. That is, it finds the smallest change to the data that causes the largest change in the class distribution. These treatments, when imposed, are able to identify the factors most likely to cause a mission-critical failure.
The goal of this research is to comparatively assess treatment learning against state-of-the-art numerical optimization techniques. To achieve this, the paper benchmarks the TAR3 and TAR4.1 treatment learners against optimization techniques across three complex systems, including two projects from the Robust Software Engineering (RSE) group within the National Aeronautics and Space Administration (NASA) Ames Research Center. The results clearly show that treatment learning is both faster and more accurate than traditional optimization methods.}, number={4}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Gay, Gregory and Menzies, Tim and Davies, Misty and Gundy-Burlet, Karen}, year={2010}, month={May}, pages={439–468} } @article{menzies_milton_turhan_cukic_jiang_bener_2010, title={Defect prediction from static code features: current results, limitations, new approaches}, volume={17}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/s10515-010-0069-5}, DOI={10.1007/s10515-010-0069-5}, number={4}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Milton, Zach and Turhan, Burak and Cukic, Bojan and Jiang, Yue and Bener, Ayşe}, year={2010}, month={May}, pages={375–407} } @article{tosun_bener_turhan_menzies_2010, title={Practical considerations in deploying statistical methods for defect prediction: A case study within the Turkish telecommunications industry}, volume={52}, ISSN={0950-5849}, url={http://dx.doi.org/10.1016/j.infsof.2010.06.006}, DOI={10.1016/j.infsof.2010.06.006}, abstractNote={Building defect prediction models in large organizations has many challenges due to limited resources and tight schedules in the software development lifecycle. It is not easy to collect data, utilize any type of algorithm and build a permanent model at once. We have conducted a study in a large telecommunications company in Turkey to employ a software measurement program and to predict pre-release defects. Based on our prior publication, we have shared our experience in terms of the project steps (i.e. challenges and opportunities). We have further introduced new techniques that improve our earlier results. In our previous work, we built similar predictors using data representative of US software development. Our task here was to check whether those predictors were specific to US organizations or applicable to a broader class of software. We have presented our approach and results in the form of an experience report. Specifically, we have made use of different techniques for improving the information content of the software data and the performance of a Naïve Bayes classifier in the prediction model that is locally tuned for the company. We have increased the information content of the software data by using module dependency data and improved the performance by adjusting the hyper-parameter (decision threshold) of the Naïve Bayes classifier. We have reported and discussed our results in terms of defect detection rates and false alarms. We also carried out a cost–benefit analysis to show that our approach can be efficiently put into practice. Our general result is that defect predictors exist across a wide range of software (in both US and Turkish organizations).
Our specific results indicate that, for the organization subject to this study, the use of version history information along with code metrics decreased false alarms by 22%, the use of dependencies between modules further reduced false alarms by 8%, and the decision threshold optimization for the Naïve Bayes classifier using code metrics and version history information further reduced false alarms by 30% in comparison to a prediction using only code metrics and a default decision threshold. Implementing statistical techniques and machine learning in a real-life scenario is a difficult yet possible task. Using simple statistical and algorithmic techniques produces an average detection rate of 88%. Although using dependency data improves our results, it is difficult to collect and analyze such data in general. Therefore, we would recommend optimizing the hyper-parameter of the proposed technique, Naïve Bayes, to calibrate the defect prediction model rather than employing more complex classifiers. We also recommend that researchers who explore statistical and algorithmic methods for defect prediction should spend less time on their algorithms and more time on studying the pragmatic considerations of large organizations.}, number={11}, journal={Information and Software Technology}, publisher={Elsevier BV}, author={Tosun, Ayşe and Bener, Ayşe and Turhan, Burak and Menzies, Tim}, year={2010}, month={Nov}, pages={1242–1257} } @inbook{turhan_bener_menzies_2010, title={Regularities in Learning Defect Predictors}, ISBN={9783642137914 9783642137921}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-13792-1_11}, DOI={10.1007/978-3-642-13792-1_11}, booktitle={Product-Focused Software Process Improvement}, publisher={Springer Berlin Heidelberg}, author={Turhan, Burak and Bener, Ayse and Menzies, Tim}, year={2010}, pages={116–130} } @article{nelson_menzies_gay_2010, title={Sharing experiments using open-source software}, volume={41}, ISSN={0038-0644}, url={http://dx.doi.org/10.1002/spe.1004}, DOI={10.1002/spe.1004}, number={3}, journal={Software: Practice and Experience}, publisher={Wiley}, author={Nelson, Adam and Menzies, Tim and Gay, Gregory}, year={2010}, month={Sep}, pages={283–305} } @article{menzies_jalali_hihn_baker_lum_2010, title={Stable rankings for different effort models}, volume={17}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/s10515-010-0070-z}, DOI={10.1007/s10515-010-0070-z}, number={4}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Jalali, Omid and Hihn, Jairus and Baker, Dan and Lum, Karen}, year={2010}, month={May}, pages={409–437} } @article{menzies_williams_elrawas_baker_boehm_hihn_lum_madachy_2009, title={Accurate estimates without local data?}, volume={14}, ISSN={1077-4866 1099-1670}, url={http://dx.doi.org/10.1002/spip.414}, DOI={10.1002/spip.414}, number={4}, journal={Software Process: Improvement and Practice}, publisher={Wiley}, author={Menzies, Tim and Williams, Steve and Elrawas, Oussama and Baker, Daniel and Boehm, Barry and Hihn, Jairus and Lum, Karen and Madachy, Ray}, year={2009}, month={Jul}, pages={213–225} } @article{gay_menzies_jalali_mundy_gilkerson_feather_kiper_2009, title={Finding robust solutions in requirements models}, volume={17}, ISSN={0928-8910 1573-7535}, url={http://dx.doi.org/10.1007/s10515-009-0059-7}, DOI={10.1007/s10515-009-0059-7}, abstractNote={Solutions to non-linear requirements
engineering problems may be “brittle”; i.e. small changes may dramatically alter solution effectiveness. Hence, it is not enough to just generate solutions to requirements problems; we must also assess solution robustness. The KEYS2 algorithm can generate decision ordering diagrams. Once generated, these diagrams can assess solution robustness in linear time. In experiments with real-world requirements engineering models, we show that KEYS2 can generate decision ordering diagrams in O(N^2). When assessed in terms of (a) reducing inference times, (b) increasing solution quality, and (c) decreasing the variance of the generated solution, KEYS2 outperforms other search algorithms (simulated annealing, ASTAR, MaxWalkSat).}, number={1}, journal={Automated Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Gay, Gregory and Menzies, Tim and Jalali, Omid and Mundy, Gregory and Gilkerson, Beau and Feather, Martin and Kiper, James}, year={2009}, month={Dec}, pages={87–116} } @inbook{orrego_menzies_el-rawas_2009, title={On the Relative Merits of Software Reuse}, ISBN={9783642016790 9783642016806}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-01680-6_18}, DOI={10.1007/978-3-642-01680-6_18}, abstractNote={Using process simulation and AI search methods, we compare software reuse against other possible changes to a project, such as reducing functionality or improving the skills of the programmer population. In one case, two generations of reuse were as good as or better than any other project change (but a third and fourth generation of reuse was not useful). In another case, applying reuse to a project was demonstrably worse than several other possible changes to a project. Our conclusion is that the general claims regarding the benefits of software reuse do not hold for specific projects. We argue that the merits of software reuse need to be evaluated on a project-by-project basis. AI search over process models is useful for such an assessment, particularly when there is not sufficient data for precisely tuning a simulation model.}, booktitle={Trustworthy Software Development Processes}, publisher={Springer Berlin Heidelberg}, author={Orrego, Andres and Menzies, Tim and El-Rawas, Oussama}, year={2009}, pages={186–197} } @article{turhan_menzies_bener_di stefano_2009, title={On the relative value of cross-company and within-company data for defect prediction}, volume={14}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/s10664-008-9103-7}, DOI={10.1007/s10664-008-9103-7}, abstractNote={We propose a practical defect prediction approach for companies that do not track defect-related data. Specifically, we investigate the applicability of cross-company (CC) data for building localized defect predictors using static code features. Firstly, we analyze the conditions where CC data can be used as is. These conditions turn out to be quite few. Then we apply principles of analogy-based learning (i.e. nearest neighbor (NN) filtering) to CC data, in order to fine-tune these models for localization. We compare the performance of these models with that of defect predictors learned from within-company (WC) data. As expected, we observe that defect predictors learned from WC data outperform the ones learned from CC data. However, our analyses also yield defect predictors learned from NN-filtered CC data, with performance close to, but still not better than, WC data.
Therefore, we perform a final analysis for determining the minimum number of local defect reports in order to learn WC defect predictors. We demonstrate in this paper that the minimum number of data samples required to build effective defect predictors can be quite small and can be collected quickly within a few months. Hence, for companies with no local defect data, we recommend a two-phase approach that allows them to employ the defect prediction process instantaneously. In phase one, companies should use NN-filtered CC data to initiate the defect prediction process and simultaneously start collecting WC (local) data. Once enough WC data is collected (i.e. after a few months), organizations should switch to phase two and use predictors learned from WC data.}, number={5}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Turhan, Burak and Menzies, Tim and Bener, Ayşe B. and Di Stefano, Justin}, year={2009}, month={Jan}, pages={540–578} } @inbook{menzies_elrawas_boehm_madachy_hihn_baker_lum_2008, title={Accurate Estimates without Calibration?}, ISBN={9783540795872 9783540795889}, url={http://dx.doi.org/10.1007/978-3-540-79588-9_19}, DOI={10.1007/978-3-540-79588-9_19}, booktitle={Making Globally Distributed Software Development a Success Story}, publisher={Springer Berlin Heidelberg}, author={Menzies, Tim and Elrawas, Oussama and Boehm, Barry and Madachy, Raymond and Hihn, Jairus and Baker, Daniel and Lum, Karen}, year={2008}, month={May}, pages={210–221} } @article{menzies_2008, title={Editorial, special issue, repeatable experiments in software engineering}, volume={13}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/s10664-008-9087-3}, DOI={10.1007/s10664-008-9087-3}, number={5}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim}, year={2008}, month={Sep}, pages={469–471} } @article{menzies_benson_costello_moats_northey_richardson_2008, title={Learning better IV&V practices}, volume={4}, ISSN={1614-5046 1614-5054}, url={http://dx.doi.org/10.1007/S11334-008-0046-3}, DOI={10.1007/S11334-008-0046-3}, number={2}, journal={Innovations in Systems and Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Benson, Markland and Costello, Ken and Moats, Christina and Northey, Melissa and Richardson, Julian}, year={2008}, month={Feb}, pages={169–183} } @article{etzkorn_menzies_2008, title={Special issue on information retrieval for program comprehension}, volume={14}, ISSN={1382-3256 1573-7616}, url={http://dx.doi.org/10.1007/s10664-008-9097-1}, DOI={10.1007/s10664-008-9097-1}, abstractNote={Welcome to the special issue on information retrieval for program comprehension (IR4PC). IR4PC employs various interdisciplinary information search techniques to examine the properties of both existing (legacy) and newly created software. IR4PC is important for software reuse, software maintenance and evolution, and reverse engineering, just to mention a few areas. Back in the 1980s and early 1990s, much program comprehension involved representing program code as control and data flow graphs. Recognizing program constructs was performed by comparing flow graphs to a plan library of known constructs (e.g. Rich and Waters 1989). However, formal non-heuristic approaches to program comprehension have been shown to be NP-hard and their success was often illustrated only in toy domains (Woods and Yang 1996). 
For this reason, heuristic approaches acquired new importance. In the 21st century, much program comprehension research has focused on applying various information retrieval techniques (e.g. text mining, LSI, knowledge-based NL understanding) to software. These new IR4PC semantic measures examine informal information in the tokens within the software itself (e.g. identifier names, function names and variable names, code comments) as well as the natural language content in external documentation such as software requirements documents or software design documents. In the past, IR4PC techniques have been successfully applied to (among other areas) static concept location (using information derived from informal tokens together with structural information such as call graphs to locate code sections that are related to given concepts), to determining whether a particular software component is reusable, to dynamic search or software reconnaissance (examining informal tokens along execution traces of a program executed with and without a particular feature), to developer identification (determining which developer is the best one to perform a particular task), and to bug location.}, number={1}, journal={Empirical Software Engineering}, publisher={Springer Science and Business Media LLC}, author={Etzkorn, Letha and Menzies, Tim}, year={2008}, month={Oct}, pages={1–4} } @article{menzies_hu_2006, title={Just enough learning (of association rules): the TAR2 “Treatment” learner}, volume={25}, ISSN={0269-2821 1573-7462}, url={http://dx.doi.org/10.1007/s10462-007-9055-0}, DOI={10.1007/s10462-007-9055-0}, number={3}, journal={Artificial Intelligence Review}, publisher={Springer Science and Business Media LLC}, author={Menzies, Tim and Hu, Ying}, year={2006}, month={May}, pages={211–229} } @inproceedings{menzies_2004, title={Mining repositories to assist in project planning and resource allocation}, ISBN={086341432X}, url={http://dx.doi.org/10.1049/IC:20040480}, DOI={10.1049/IC:20040480}, abstractNote={Software repositories plus defect logs are useful for learning defect detectors. Such defect detectors could be a useful resource allocation tool for software managers. One way to view our detectors is that they are a V&V tool; i.e. they can be used to assess if "too much" of the testing budget is going to "too little" of the system. Finding such detectors could become the business case showing that constructing a local repository is useful. Three counter arguments to such a proposal are (1) no general conclusions have been reported in any such repository despite years of effort; (2) if such general conclusions existed then there would be no need to build a local repository; (3) no such general conclusions will ever exist, according to many researchers. This article is a reply to these three arguments. To appear in the International Workshop on Mining Software Repositories (co-located with ICSE 2004), May 2004; http://msr.uwaterloo.ca. To make the most of finite resources, test engineers typically use their own expertise to separate critical from non-critical software components. The critical components are then allocated more of the testing budget than the rest of the system. A concern with this approach is that the wrong parts of the system might get the lion's share of the testing resource. Defect detectors based on static code measures of components in repositories are a fast way of surveying the supposedly non-mission-critical sections.
Such detectors can be a V&V tool; i.e. they can be used to assess if too much of the testing budget is going to too little of the system. As shown below, satisfactory detectors can be learnt from simple static code measures based on the Halstead [2] and McCabe [3] features.}, booktitle={"International Workshop on Mining Software Repositories (MSR 2004)" W17S Workshop - 26th International Conference on Software Engineering}, publisher={IEE}, author={Menzies, T.}, year={2004} }