@article{ash_hughes-oliver_2022, title={Confidence bands and hypothesis tests for hit enrichment curves}, volume={14}, ISSN={["1758-2946"]}, DOI={10.1186/s13321-022-00629-0}, abstractNote={In virtual screening for drug discovery, hit enrichment curves are widely used to assess the performance of ranking algorithms with regard to their ability to identify early enrichment. Unfortunately, researchers almost never consider the uncertainty associated with estimating such curves before declaring differences between performance of competing algorithms. Uncertainty is often large because the testing fractions of interest to researchers are small. Appropriate inference is complicated by two sources of correlation that are often overlooked: correlation across different testing fractions within a single algorithm, and correlation between competing algorithms. Additionally, researchers are often interested in making comparisons along the entire curve, not only at a few testing fractions. We develop inferential procedures to address both the needs of those interested in a few testing fractions, as well as those interested in the entire curve. For the former, four hypothesis testing and (pointwise) confidence interval procedures are investigated, and a newly developed EmProc approach is found to be most effective. For inference along entire curves, EmProc-based confidence bands are recommended for simultaneous coverage and minimal width. While we focus on the hit enrichment curve, this work is also appropriate for lift curves that are used throughout the machine learning community. Our inferential procedures trivially extend to enrichment factors, as well.}, number={1}, journal={JOURNAL OF CHEMINFORMATICS}, author={Ash, Jeremy R. and Hughes-Oliver, Jacqueline M.}, year={2022}, month={Jul} } @article{hughes-oliver_xu_baynes_2018, title={Skin Permeation of Solutes from Metalworking Fluids to Build Prediction Models and Test A Partition Theory}, volume={23}, ISSN={["1420-3049"]}, DOI={10.3390/molecules23123076}, abstractNote={Permeation of chemical solutes through skin can create major health issues. Using the membrane-coated fiber (MCF) as a solid phase membrane extraction (SPME) approach to simulate skin permeation, we obtained partition coefficients for 37 solutes under 90 treatment combinations that could broadly represent formulations that could be associated with occupational skin exposure. These formulations were designed to mimic fluids in the metalworking process, and they are defined in this manuscript using: one of mineral oil, polyethylene glycol-200, soluble oil, synthetic oil, or semi-synthetic oil; at a concentration of 0.05 or 0.5 or 5 percent; with solute concentration of 0.01, 0.05, 0.1, 0.5, 1, or 5 ppm. A single linear free-energy relationship (LFER) model was shown to be inadequate, but extensions that account for experimental conditions provide important improvements in estimating solute partitioning from selected formulations into the MCF. The benefit of the Expanded Nested-Solute-Concentration LFER model over the Expanded Crossed-Factors LFER model is only revealed through a careful leave-one-solute-out cross-validation that properly addresses the existence of replicates to avoid an overly optimistic view of predictive power.
Finally, the partition theory that accompanies the MCF approach is thoroughly tested and found not to be supported under complex experimental settings that mimic occupational exposure in the metalworking industry.}, number={12}, journal={MOLECULES}, author={Hughes-Oliver, Jacqueline M. and Xu, Guangning and Baynes, Ronald E.}, year={2018}, month={Dec} } @article{ash_hughes-oliver_2018, title={chemmodlab: a cheminformatics modeling laboratory R package for fitting and assessing machine learning models}, volume={10}, ISSN={["1758-2946"]}, DOI={10.1186/s13321-018-0309-4}, abstractNote={The goal of chemmodlab is to streamline the fitting and assessment pipeline for many machine learning models in R, making it easy for researchers to compare the utility of these models. While focused on implementing methods for model fitting and assessment that have been accepted by experts in the cheminformatics field, all of the methods in chemmodlab have broad utility for the machine learning community. chemmodlab contains several assessment utilities, including a plotting function that constructs accumulation curves and a function that computes many performance measures. The most novel feature of chemmodlab is the ease with which statistically significant performance differences for many machine learning models are presented by means of the multiple comparisons similarity plot. Differences are assessed using repeated k-fold cross validation, where blocking increases precision and multiplicity adjustments are applied. chemmodlab is freely available on CRAN at https://cran.r-project.org/web/packages/chemmodlab/index.html.}, journal={JOURNAL OF CHEMINFORMATICS}, author={Ash, Jeremy R. and Hughes-Oliver, Jacqueline M.}, year={2018}, month={Nov} } @article{hughes-oliver_2017, title={Mentoring to achieve diversity in graduate programs}, volume={71}, DOI={10.1080/00031305.2016.1255661}, abstractNote={The discipline of statistics has a celebrated, diverse, and colorful past. With a definite international flavor, we continue to make great strides in keeping our discipline relevant and accessible for addressing significant societal concerns. Unfortunately, we lag behind many other disciplines when it comes to fully tapping into the potential of all demographic groups within the United States. Mentoring provides one of many opportunities to change this narrative. This article looks at hard numbers related to diversity, points to some existing successful mentoring programs, and is a reflection of lessons learned through personal experiences.}, number={1}, journal={American Statistician}, author={Hughes-Oliver, J. M.}, year={2017}, pages={55–60} } @article{zhang_hughes-oliver_young_2013, title={Analysis of High-Dimensional Structure-Activity Screening Datasets Using the Optimal Bit String Tree}, volume={55}, ISSN={0040-1706 1537-2723}, url={http://dx.doi.org/10.1080/00401706.2012.760489}, DOI={10.1080/00401706.2012.760489}, abstractNote={We propose a new classification method called the Optimal Bit String Tree (OBSTree) to identify quantitative structure-activity relationships (QSARs). The method introduces the concept of a chromosome to describe the presence/absence context of a combination of descriptors. A descriptor set and its optimal chromosome form the splitting variable. A new stochastic searching scheme that contains a weighted sampling scheme, simulated annealing, and a trimming procedure optimizes the choice of splitting variable.
Simulation studies and an application to screening monoamine oxidase inhibitors show that OBSTree is advantageous in accurately and effectively identifying QSAR rules and finding different classes of active compounds. Details of the algorithm, SAS code, and simulated and real datasets are available online as supplementary materials.}, number={2}, journal={Technometrics}, publisher={Informa UK Limited}, author={Zhang, Ke and Hughes-Oliver, Jacqueline M. and Young, S. Stanley}, year={2013}, month={May}, pages={161–173} } @article{xu_hughes-oliver_brooks_baynes_2013, title={Predicting skin permeability from complex chemical mixtures: incorporation of an expanded QSAR model}, volume={24}, ISSN={1062-936X 1029-046X}, url={http://dx.doi.org/10.1080/1062936X.2013.792875}, DOI={10.1080/1062936X.2013.792875}, abstractNote={Quantitative structure–activity relationship (QSAR) models have been widely used to study the permeability of chemicals or solutes through skin. Among the various QSAR models, Abraham’s linear free-energy relationship (LFER) model is often employed. However, when the experimental conditions are complex, it is not always appropriate to use Abraham’s LFER model with a single set of regression coefficients. In this paper, we propose an expanded model in which one set of partial slopes is defined for each experimental condition, where conditions are defined according to solvent: water, synthetic oil, semi-synthetic oil, or soluble oil. This model not only accounts for experimental conditions but also improves the ability to conduct rigorous hypothesis testing. To more adequately evaluate the predictive power of the QSAR model, we modified the usual leave-one-out internal validation strategy to employ a leave-one-solute-out strategy and accordingly adjust the Q2 LOO statistic. Skin permeability was shown to have the rank order: water > synthetic > semi-synthetic > soluble oil. In addition, fitted relationships between permeability and solute characteristics differ according to solvents. We demonstrated that the expanded model (r2 = 0.70) improved both the model fit and the predictive power when compared with the simple model (r2 = 0.21).}, number={9}, journal={SAR and QSAR in Environmental Research}, publisher={Informa UK Limited}, author={Xu, G. and Hughes-Oliver, J.M. and Brooks, J.D. and Baynes, R.E.}, year={2013}, month={Sep}, pages={711–731} } @article{xu_hughes-oliver_brooks_yeatts_baynes_2013, title={Selection of appropriate training and validation set chemicals for modelling dermal permeability by U-optimal design}, volume={24}, ISSN={1062-936X 1029-046X}, url={http://dx.doi.org/10.1080/1062936X.2012.742458}, DOI={10.1080/1062936X.2012.742458}, abstractNote={Quantitative structure-activity relationship (QSAR) models are being used increasingly in skin permeation studies. The main idea of QSAR modelling is to quantify the relationship between biological activities and chemical properties, and thus to predict the activity of chemical solutes. As a key step, the selection of a representative and structurally diverse training set is critical to the prediction power of a QSAR model. Early QSAR models selected training sets in a subjective way and solutes in the training set were relatively homogenous. More recently, statistical methods such as D-optimal design or space-filling design have been applied but such methods are not always ideal. This paper describes a comprehensive procedure to select training sets from a large candidate set of 4534 solutes. 
A newly proposed ‘Baynes’ rule’, which is a modification of Lipinski's ‘rule of five’, was used to screen out solutes that were not qualified for the study. U-optimality was used as the selection criterion. A principal component analysis showed that the selected training set was representative of the chemical space. Gas chromatograph amenability was verified. A model built using the training set was shown to have greater predictive power than a model built using a previous dataset [1].}, number={2}, journal={SAR and QSAR in Environmental Research}, publisher={Informa UK Limited}, author={Xu, G. and Hughes-Oliver, J.M. and Brooks, J.D. and Yeatts, J.L. and Baynes, R.E.}, year={2013}, month={Feb}, pages={135–156} } @article{nail_hughes-oliver_monahan_2011, title={Quantifying Local Creation and Regional Transport Using a Hierarchical Space-Time Model of Ozone as a Function of Observed NOx , a Latent Space-Time VOC Process, Emissions, and Meteorology}, volume={16}, ISSN={["1085-7117"]}, DOI={10.1007/s13253-010-0028-4}, abstractNote={We explore the ability of a process-based space–time model to decompose 8-hour ozone on a given day and site into parts attributable to local emissions and regional transport, to provide space–time predictions, and to assess the efficacy of past and future emission controls. We model ozone as created plus transported plus an error with seasonally varying spatial covariance parameters. Created ozone is a function of the observed NO x concentration, the latent VOC concentration, and solar radiation surrogates. Transported ozone is a weighted average of the ozone observed at all sites on the previous day, where the weights are a function of wind speed and direction. The latent VOC process mean includes emissions, temperature, and a workday indicator, and the error has seasonally varying spatial covariance parameters. Using likelihood methods, we fit the model and obtain one set of predictions appropriate for prediction backward in time, and another appropriate for predicting under hypothetical emission scenarios. The first set of predictions has a lower root-mean-squared error (RMSE) when compared to point observations than do the 36 km gridcell averages from the Community Mesoscale Air Quality Model (CMAQ) used by the EPA; the second set has the same RMSE as CMAQ, but under-predicts high ozone values.}, number={1}, journal={JOURNAL OF AGRICULTURAL BIOLOGICAL AND ENVIRONMENTAL STATISTICS}, author={Nail, A. J. and Hughes-Oliver, J. M. and Monahan, J. F.}, year={2011}, month={Mar}, pages={17–44} } @article{heo_hughes-oliver_2010, title={Uncertainty adjustments to deterministic atmospheric dispersion models}, volume={42}, number={1-3}, journal={International Journal of Environment and Pollution}, author={Heo, T. Y. and Hughes-Oliver, J. M.}, year={2010}, pages={85–106} } @article{zhang_hughes-oliver_ng_2009, title={A Model-Based Ensembling Approach for Developing QSARs}, volume={49}, ISSN={["1549-960X"]}, DOI={10.1021/ci900080f}, abstractNote={Ensemble methods have become popular for QSAR modeling, but most studies have assumed balanced data, consisting of approximately equal numbers of active and inactive compounds. Cheminformatics data are often far from being balanced. We extend the application of ensemble methods to include cases of imbalance of class membership and to more adequately assess model output. 
Based on the extension, we propose an ensemble method called MBEnsemble that automatically determines the appropriate tuning parameters to provide reliable predictions and maximize the F-measure. Results from multiple data sets demonstrate that the proposed ensemble technique works well on imbalanced data.}, number={8}, journal={JOURNAL OF CHEMICAL INFORMATION AND MODELING}, author={Zhang, Qianyi and Hughes-Oliver, Jacqueline M. and Ng, Raymond T.}, year={2009}, month={Aug}, pages={1857–1865} } @article{hughes-oliver_heo_ghosh_2009, title={An autoregressive point source model for spatial processes}, volume={20}, ISSN={["1099-095X"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-68149096814&partnerID=MN8TOARS}, DOI={10.1002/env.957}, abstractNote={We suggest a parametric modeling approach for nonstationary spatial processes driven by point sources. Baseline near‐stationarity, which may be reasonable in the absence of a point source, is modeled using a conditional autoregressive (CAR) Markov random field. Variability due to the point source is captured by our proposed autoregressive point source (ARPS) model. Inference proceeds according to the Bayesian hierarchical paradigm, and is implemented using Markov chain Monte Carlo (MCMC) methods. The parametric approach allows a formal test of effectiveness of the point source. Application is made to a real dataset on electric potential measurements in a field containing a metal pole and the finding is that our approach captures the pole's impact on small‐scale variability of the electric potential process. Copyright © 2008 John Wiley & Sons, Ltd.}, number={5}, journal={ENVIRONMETRICS}, author={Hughes-Oliver, Jacqueline M. and Heo, Tae-Young and Ghosh, Sujit K.}, year={2009}, month={Aug}, pages={575–594} } @article{liu_hughes-oliver_menius_2007, title={Domain-enhanced analysis of microarray data using GO annotations}, volume={23}, ISSN={["1367-4803"]}, DOI={10.1093/bioinformatics/btm092}, number={10}, journal={BIOINFORMATICS}, author={Liu, Jiajun and Hughes-Oliver, Jacqueline M. and Menius, Alan, Jr.}, year={2007}, month={May}, pages={1225–1234} } @article{remlinger_hughes-oliver_young_lam_2006, title={Statistical design of pools using optimal coverage and minimal collision}, volume={48}, ISSN={["0040-1706"]}, DOI={10.1198/004017005000000481}, abstractNote={Discovery of a new drug involves screening large chemical libraries to identify new and diverse active compounds. Screening efficiency can be improved by testing compounds in pools. We consider two criteria for designing pools: optimal coverage of the chemical space and minimal collision between compounds. We apply four pooling designs to a public dataset. We evaluate each method by determining how well the design criteria are met and whether the methods are able to find many diverse active compounds. One pooling design emerges as a winner, but all designed pools clearly outperform randomly created pools.}, number={1}, journal={TECHNOMETRICS}, author={Remlinger, KS and Hughes-Oliver, JM and Young, SS and Lam, RL}, year={2006}, month={Feb}, pages={133–143} } @article{yi_hughes-oliver_zhu_young_2002, title={A factorial design to optimize cell-based drug discovery analysis}, volume={42}, ISSN={["0095-2338"]}, DOI={10.1021/ci025509n}, abstractNote={Drug discovery is dependent on finding a very small number of biologically active or potent compounds among millions of compounds stored in chemical collections.
Quantitative structure-activity relationships suggest that potency of a compound is highly related to that compound's chemical makeup or structure. To improve the efficiency of cell-based analysis methods for high throughput screening, where information about a compound's structure is used to predict potency, we consider a number of potentially influential factors in the cell-based approach. A fractional factorial design is implemented to evaluate the effects of these factors, and lift chart results show that the design scheme is able to find conditions that enhance hit rates.}, number={5}, journal={JOURNAL OF CHEMICAL INFORMATION AND COMPUTER SCIENCES}, author={Yi, BM and Hughes-Oliver, JM and Zhu, L and Young, SS}, year={2002}, pages={1221–1229} } @article{zhu_hughes-oliver_young_2001, title={Statistical decoding of potent pools based on chemical structure}, volume={57}, DOI={10.1111/j.0006-341X.2001.00922.x}, abstractNote={Pooling experiments are used as a cost‐effective approach for screening chemical compounds as part of the drug discovery process in pharmaceutical companies. When a biologically potent pool is found, the goal is to decode the pool, i.e., to determine which of the individual compounds are potent. We propose augmenting the data on pooled testing with information on the chemical structure of compounds in order to complete the decoding process. This proposal is based on the well‐known relationship between biological potency of a compound and its chemical structure. Application to real data from a drug discovery process at GlaxoSmithKline reveals a 100% increase in hit rate, namely, the number of potent compounds identified divided by the number of tests required.}, number={3}, journal={Biometrics}, author={Zhu, L. and Hughes-Oliver, J. M. and Young, S. S.}, year={2001}, pages={922–930} } @article{hughes-oliver_rosenberger_2000, title={Efficient estimation of the prevalence of multiple rare traits}, volume={87}, ISSN={["0006-3444"]}, DOI={10.1093/biomet/87.2.315}, abstractNote={We consider a population with multiple traits of interest, where our goal is to estimate the proportions of individuals with the traits. When traits are rare, group testing can improve efficiency. Previous work of Hughes-Oliver & Swallow (1994) developed an adaptive two-stage design for group testing of only one trait. We extend this work to the multi-trait case. We derive the optimum group sizes using compound D-optimum design theory. Estimation is based on maximum likelihood estimators, which are shown to be consistent and asymptotically normal. We apply our design to a problem of estimating the prevalence of HIV, chlamydia and syphilis in Ethiopian women.}, number={2}, journal={BIOMETRIKA}, author={Hughes-Oliver, JM and Rosenberger, WF}, year={2000}, month={Jun}, pages={315–327} } @article{boos_hughes-oliver_2000, title={How large does n have to be for Z and t intervals?}, volume={54}, ISSN={["0003-1305"]}, DOI={10.2307/2686030}, number={2}, journal={AMERICAN STATISTICIAN}, author={Boos, DD and Hughes-Oliver, JM}, year={2000}, month={May}, pages={121–128} } @article{su_lu_chen_hughes-oliver_1999, title={A random coefficient degradation model with random sample size}, volume={5}, ISSN={["1572-9249"]}, DOI={10.1023/A:1009653529152}, abstractNote={In testing product reliability, there is often a critical cutoff level that determines whether a specimen is classified as "failed." One consequence is that the number of degradation data collected varies from specimen to specimen.
The information of random sample size should be included in the model, and our study shows that it can be influential in estimating model parameters. Two-stage least squares (LS) and maximum modified likelihood (MML) estimation, which both assume fixed sample sizes, are commonly used for estimating parameters in the repeated measurements models typically applied to degradation data. However, the LS estimate is not consistent in the case of random sample sizes. This article derives the likelihood for the random sample size model and suggests using maximum likelihood (ML) for parameter estimation. Our simulation studies show that ML estimates have smaller biases and variances compared to the LS and MML estimates. All estimation methods can be greatly improved if the number of specimens increases from 5 to 10. A data set from a semiconductor application is used to illustrate our methods.}, number={2}, journal={LIFETIME DATA ANALYSIS}, author={Su, C and Lu, JC and Chen, D and Hughes-Oliver, JM}, year={1999}, month={Jun}, pages={173–183} } @article{lu_liu_yin_hughes-oliver_1999, title={Modeling restricted bivariate censored lowflow data}, volume={10}, ISSN={["1180-4009"]}, DOI={10.1002/(SICI)1099-095X(199903/04)10:2<125::AID-ENV340>3.3.CO;2-P}, abstractNote={Environmental studies often result in censored data. In this article, the lowflow quantiles Q*7,2 and Q*7,10 below a limit are treated as censored data. These streamflow quantiles are important for water resources planning and management. Our partial all-subsets censored regression procedure identifies a few important explanatory variables, such as drainage area, basin slope, soil-infiltration index, rainfall index, and some combinations of them. The proposed maximum likelihood estimation method incorporates the restriction Q*7,2≥Q*7,10 and the bivariate probability distribution of the quantiles to improve model quality. Analyses of the lowflow quantiles obtained from streams in West-Central Florida show that our procedure is more appropriate than the commonly used univariate main-effects models in predicting quantiles. Copyright © 1999 John Wiley & Sons, Ltd.}, number={2}, journal={ENVIRONMETRICS}, author={Lu, JC and Liu, SP and Yin, M and Hughes-Oliver, JM}, year={1999}, pages={125–136} } @article{hughes-oliver_gonzalez-farias_1999, title={Parametric covariance models for shock-induced stochastic processes}, volume={77}, ISSN={["0378-3758"]}, DOI={10.1016/S0378-3758(98)00186-4}, abstractNote={A common assumption in modeling stochastic processes is that of weak stationarity. Although this is a convenient and sometimes justifiable assumption for many applications, there are other applications for which it is clearly inappropriate. One such application occurs when the process is driven by action at a limited number of sites, or point sources. Interest may lie not only in predicting the process, but also in assessing the effect of the point sources. In this article we present a general parametric approach of accounting for the effect of point sources in the covariance model of a stochastic process, and we discuss properties of a particular family from this general class. A simulation study demonstrates the performance of parameter estimation using this model, and the predictive ability of this model is shown to be better than some commonly used modeling approaches. 
Application to a dataset of electromagnetism measurements in a field containing a metal pole shows the advantages of our parametric nonstationary covariance models.}, number={1}, journal={JOURNAL OF STATISTICAL PLANNING AND INFERENCE}, author={Hughes-Oliver, JM and Gonzalez-Farias, G}, year={1999}, month={Feb}, pages={51–72} } @article{hughes-oliver_lu_davis_gyurcsik_1998, title={Achieving uniformity in a semiconductor fabrication process using spatial modeling}, volume={93}, ISSN={["0162-1459"]}, DOI={10.2307/2669600}, abstractNote={Abstract Material is deposited onto the wafer surface during several steps of wafer fabrication. This material must be deposited evenly across the entire wafer surface, close to the targeted thickness, and with little wafer-to-wafer variability. But unequal variances across the wafer and under different process conditions, as well as nonstationary correlation across a wafer, make these goals difficult to achieve, because traditional methods for optimizing deposition processes assume homogeneity and independence. We avoid these assumptions and determine the best settings of process variables using physically motivated statistical models for the mean response, unequal variances, and nonstationary spatial correlation structure. Data from a rapid thermal chemical vapor deposition process is used to illustrate the approach. A simulation exercise demonstrates the advantages of fitting flexible variance models and using appropriate performance measures.}, number={441}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Hughes-Oliver, JM and Lu, JC and Davis, JC and Gyurcsik, RS}, year={1998}, month={Mar}, pages={36–45} } @article{boos_hughes-oliver_1998, title={Applications of Basu's theorem}, volume={52}, ISSN={["0003-1305"]}, DOI={10.2307/2685927}, number={3}, journal={AMERICAN STATISTICIAN}, author={Boos, DD and Hughes-Oliver, JM}, year={1998}, month={Aug}, pages={218–221} } @article{chen_lu_hughes-oliver_li_1998, title={Asymptotic properties of maximum likelihood estimates for a bivariate exponential distribution and mixed censored data}, volume={48}, ISSN={["0026-1335"]}, DOI={10.1007/s001840050003}, number={2}, journal={METRIKA}, author={Chen, D and Lu, JC and Hughes-Oliver, JM and Li, CS}, year={1998}, pages={109–125} } @article{hughes-oliver_gonzalez-farias_lu_chen_1998, title={Parametric nonstationary correlation models}, volume={40}, ISSN={["0167-7152"]}, DOI={10.1016/S0167-7152(98)00103-5}, abstractNote={Stochastic processes observed over space often exhibit nonstationarity. Possible causes of nonstationarity include mean drift, heterogeneity of responses, or a correlation pattern that is not simply a function of the Euclidean distance between two spatial locations. This paper considers the latter. The need for nonstationary correlation models has been demonstrated in several application areas, including environmental monitoring of pollutants, and modeling of semiconductor fabrication processes. We present parametric nonstationary correlation models for capturing the effect of point sources. For example, if the response variable is carbon monoxide, then a smoke stack producing carbon monoxide would be considered a point source, and it is unreasonable to believe that correlation would not depend on proximity to the smoke stack. Our parametric models allow the consideration of multiple-point sources, as well as testing the strength of a particular source. 
These models have the usual anisotropic and isotropic exponential correlation functions as special cases.}, number={3}, journal={STATISTICS & PROBABILITY LETTERS}, author={Hughes-Oliver, JM and Gonzalez-Farias, G and Lu, JC and Chen, D}, year={1998}, month={Oct}, pages={267–278} }