@article{chen_fu_krishna_menzies_2018, title={Applications of Psychological Science for Actionable Analytics}, DOI={10.1145/3236024.3236050}, abstractNote={According to psychological scientists, humans understand models that most match their own internal models, which they characterize as lists of "heuristic"s (i.e. lists of very succinct rules). One such heuristic rule generator is the Fast-and-Frugal Trees (FFT) preferred by psychological scientists. Despite their successful use in many applied domains, FFTs have not been applied in software analytics. Accordingly, this paper assesses FFTs for software analytics. We find that FFTs are remarkably effective in that their models are very succinct (5 lines or less describing a binary decision tree) while also outperforming result from very recent, top-level, conference papers. Also, when we restrict training data to operational attributes (i.e., those attributes that are frequently changed by developers), the performance of FFTs are not effected (while the performance of other learners can vary wildly). Our conclusions are two-fold. Firstly, there is much that software analytics community could learn from psychological science. Secondly, proponents of complex methods should always baseline those methods against simpler alternatives. For example, FFTs could be used as a standard baseline learner against which other software analytics tools are compared.}, journal={ESEC/FSE'18: PROCEEDINGS OF THE 2018 26TH ACM JOINT MEETING ON EUROPEAN SOFTWARE ENGINEERING CONFERENCE AND SYMPOSIUM ON THE FOUNDATIONS OF SOFTWARE ENGINEERING}, author={Chen, Di and Fu, Wei and Krishna, Rahul and Menzies, Tim}, year={2018}, pages={456–467} } @article{chen_lu_lin_2005, title={Asymptotic distributions of semiparametric maximum likelihood estimators with estimating equations for group-censored data}, volume={47}, ISSN={["1467-842X"]}, DOI={10.1111/j.1467-842X.2005.00382.x}, abstractNote={Semiparametric maximum likelihood estimation with estimating equations (SMLE) is more flexible than traditional methods; it has fewer restrictions on distributions and regression models. The required information about distribution and regression structures is incorporated in estimating equations of the SMLE to improve the estimation quality of non‐parametric methods. The likelihood of SMLE for censored data involves complicated implicit functions without closed‐form expressions, and the first derivatives of the log‐profile‐likelihood cannot be expressed as summations of independent and identically distributed random variables; it is challenging to derive asymptotic properties of the SMLE for censored data. For group‐censored data, the paper shows that all the implicit functions are well defined and obtains the asymptotic distributions of the SMLE for model parameters and lifetime distributions. With several examples the paper compares the SMLE, the regular non‐parametric likelihood estimation method and the parametric MLEs in terms of their asymptotic efficiencies, and illustrates application of SMLE. Various asymptotic distributions of the likelihood ratio statistics are derived for testing the adequacy of estimating equations and a partial set of parameters equal to some known values.}, number={2}, journal={AUSTRALIAN & NEW ZEALAND JOURNAL OF STATISTICS}, author={Chen, D and Lu, JC and Lin, SC}, year={2005}, month={Jun}, pages={173–192} } @article{lu_chen_gan_2002, title={Semi-parametric modelling and likelihood estimation with estimating equations}, volume={44}, ISSN={["1369-1473"]}, DOI={10.1111/1467-842X.00222}, abstractNote={This paper proposes a semi‐parametric modelling and estimating method for analysing censored survival data. The proposed method uses the empirical likelihood function to describe the information in data, and formulates estimating equations to incorporate knowledge of the underlying distribution and regression structure. The method is more flexible than the traditional methods such as the parametric maximum likelihood estimation (MLE), Cox’s (1972) proportional hazards model, accelerated life test model, quasi‐likelihood (Wedderburn, 1974) and generalized estimating equations (Liang & Zeger, 1986). This paper shows the existence and uniqueness of the proposed semi‐parametric maximum likelihood estimates (SMLE) with estimating equations. The method is validated with known cases studied in the literature. Several finite sample simulation and large sample efficiency studies indicate that when the sample size is larger than 100 the SMLE is compatible with the parametric MLE; and in all case studies, the SMLE is about 15% better than the parametric MLE with a mis‐specified underlying distribution.}, number={2}, journal={AUSTRALIAN & NEW ZEALAND JOURNAL OF STATISTICS}, author={Lu, JC and Chen, D and Gan, NC}, year={2002}, month={Jun}, pages={193–212} } @article{chen_lu_huo_yin_2001, title={Optimum percentile estimating equations for nonlinear random coefficient models}, volume={97}, ISSN={["0378-3758"]}, DOI={10.1016/S0378-3758(00)00219-6}, abstractNote={In nonlinear random coefficients models, the means or variances of response variables may not exist. In such cases, commonly used estimation procedures, e.g., (extended) least-squares (LS) and quasi-likelihood methods, are not applicable. This article solves this problem by proposing an estimate based on percentile estimating equations (PEE). This method does not require full distribution assumptions and leads to efficient estimates within the class of unbiased estimating equations. By minimizing the asymptotic variance of the PEE estimates, the optimum percentile estimating equations (OPEE) are derived. Several examples including Weibull regression show the flexibility of the PEE estimates. Under certain regularity conditions, the PEE estimates are shown to be strongly consistent and asymptotic normal, and the OPEE estimates have the minimal asymptotic variance. Compared with the parametric maximum likelihood estimates (MLE), the asymptotic efficiency of the OPEE estimates is more than 98%, while the LS-type of procedures can have infinite variances. When the observations have outliers or do not follow the distributions considered in model assumptions, the article shows that OPEE is more robust than the MLE, and the asymptotic efficiency in the model misspecification cases can be above 150%.}, number={2}, journal={JOURNAL OF STATISTICAL PLANNING AND INFERENCE}, author={Chen, D and Lu, JC and Huo, XM and Yin, M}, year={2001}, month={Sep}, pages={275–292} } @article{lu_holton_fenner_williams_kim_hartford_chen_roze_littlejohn_1998, title={A new device design methodology for manufacturability}, volume={45}, ISSN={0018-9383}, url={http://dx.doi.org/10.1109/16.661225}, DOI={10.1109/16.661225}, abstractNote={As future technology generations for integrated circuits continue to "shrink", TCAD tools must be made more central to manufacturing issues; thus, yield optimization and design for manufacturing (DFM) should be addressed integrally with performance and reliability when using TCAD during the initial product design. This paper defines the goals for DFM in TCAD simulations and outlines a formal procedure for achieving an optimized result (ODFM). New design of experiments (DOE), weighted least squares modeling and multiple-objective mean-variance optimization methods are developed as significant parts of the new ODFM procedure. Examples of designing a 0.18-/spl mu/m MOSFET device are given to show the impact of device design procedures on device performance distributions and sensitivity variance profiles.}, number={3}, journal={IEEE Transactions on Electron Devices}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Lu, J.-C. and Holton, W.C. and Fenner, J.S. and Williams, S.C. and Kim, K.W. and Hartford, A.H. and Chen, D. and Roze, K. and Littlejohn, M.A.}, year={1998}, month={Mar}, pages={634–642} } @article{chen_lu_hughes-oliver_li_1998, title={Asymptotic properties of maximum likelihood estimates for a bivariate exponential distribution and mixed censored data}, volume={48}, ISSN={["0026-1335"]}, DOI={10.1007/s001840050003}, number={2}, journal={METRIKA}, author={Chen, D and Lu, JC and Hughes-Oliver, JM and Li, CS}, year={1998}, pages={109–125} } @article{hughes-oliver_gonzalez-farias_lu_chen_1998, title={Parametric nonstationary correlation models}, volume={40}, ISSN={["0167-7152"]}, DOI={10.1016/S0167-7152(98)00103-5}, abstractNote={Stochastic processes observed over space often exhibit nonstationarity. Possible causes of nonstationarity include mean drift, heterogeneity of responses, or a correlation pattern that is not simply a function of the Euclidean distance between two spatial locations. This paper considers the latter. The need for nonstationary correlation models has been demonstrated in several application areas, including environmental monitoring of pollutants, and modeling of semiconductor fabrication processes. We present parametric nonstationary correlation models for capturing the effect of point sources. For example, if the response variable is carbon monoxide, then a smoke stack producing carbon monoxide would be considered a point source, and it is unreasonable to believe that correlation would not depend on proximity to the smoke stack. Our parametric models allow the consideration of multiple-point sources, as well as testing the strength of a particular source. These models have the usual anisotropic and isotropic exponential correlation functions as special cases.}, number={3}, journal={STATISTICS & PROBABILITY LETTERS}, author={Hughes-Oliver, JM and Gonzalez-Farias, G and Lu, JC and Chen, D}, year={1998}, month={Oct}, pages={267–278} }