% Rhodes, Davidian, Lu (2024), Biostatistics: ReLiVE / ReLiVE-Q estimators for optimal treatment regimes.
@article{rhodes_davidian_lu_2024,
  title = {Estimation of optimal treatment regimes with electronic medical record data using the residual life value estimator},
  volume = {2},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1468-4357},
  doi = {10.1093/biostatistics/kxae002},
  abstractNote = {Summary Clinicians and patients must make treatment decisions at a series of key decision points throughout disease progression. A dynamic treatment regime is a set of sequential decision rules that return treatment decisions based on accumulating patient information, like that commonly found in electronic medical record (EMR) data. When applied to a patient population, an optimal treatment regime leads to the most favorable outcome on average. Identifying optimal treatment regimes that maximize residual life is especially desirable for patients with life-threatening diseases such as sepsis, a complex medical condition that involves severe infections with organ dysfunction. We introduce the residual life value estimator (ReLiVE), an estimator for the expected value of cumulative restricted residual life under a fixed treatment regime. Building on ReLiVE, we present a method for estimating an optimal treatment regime that maximizes expected cumulative restricted residual life. Our proposed method, ReLiVE-Q, conducts estimation via the backward induction algorithm Q-learning. We illustrate the utility of ReLiVE-Q in simulation studies, and we apply ReLiVE-Q to estimate an optimal treatment regime for septic patients in the intensive care unit using EMR data from the Multiparameter Intelligent Monitoring Intensive Care database. 
Ultimately, we demonstrate that ReLiVE-Q leverages accumulating patient information to estimate personalized treatment regimes that optimize a clinically meaningful function of residual life.},
  journal = {Biostatistics},
  author = {Rhodes, Grace and Davidian, Marie and Lu, Wenbin},
  year = {2024},
  month = feb
}

% Wang et al. (2024), Statistics in Medicine; no volume, issue, or pages in this record.
@article{wang_cen_huang_qian_dean_ellenberg_fleming_lu_longini_2024,
  title = {Methods for the estimation of direct and indirect vaccination effects by combining data from individual- and cluster-randomized trials},
  issn = {1097-0258},
  doi = {10.1002/sim.10030},
  abstractNote = {Both individually and cluster randomized study designs have been used for vaccine trials to assess the effects of vaccine on reducing the risk of disease or infection. The choice between individually and cluster randomized designs is often driven by the target estimand of interest (eg, direct versus total), statistical power, and, importantly, logistic feasibility. To combat emerging infectious disease threats, especially when the number of events from one single trial may not be adequate to obtain vaccine effect estimates with a desired level of precision, it may be necessary to combine information across multiple trials. In this article, we propose a model formulation to estimate the direct, indirect, total, and overall vaccine effects combining data from trials with two types of study designs: individual‐randomization and cluster‐randomization, based on a Cox proportional hazards model, where the hazard of infection depends on both vaccine status of the individual as well as the vaccine status of the other individuals in the same cluster. 
We illustrate the use of the proposed model and assess the potential efficiency gain from combining data from multiple trials, compared to using data from each individual trial alone, through two simulation studies, one of which is designed based on a cholera vaccine trial previously carried out in Matlab, Bangladesh.},
  journal = {Statistics in Medicine},
  author = {Wang, Rui and Cen, Mengqi and Huang, Yunda and Qian, George and Dean, Natalie E. and Ellenberg, Susan S. and Fleming, Thomas R. and Lu, Wenbin and Longini, Ira M.},
  year = {2024},
  month = feb
}

% Wan, Li, Lu, Song (2024), Journal of Econometrics 239(2).
@article{wan_li_lu_song_2024,
  title = {Mining the factor zoo: Estimation of latent factor models with sufficient proxies},
  volume = {239},
  issn = {1872-6895},
  doi = {10.1016/j.jeconom.2022.08.013},
  abstractNote = {Latent factor model estimation typically relies on either using domain knowledge to manually pick several observed covariates as factor proxies, or purely conducting multivariate analysis such as principal component analysis. However, the former approach may suffer from the bias while the latter cannot incorporate additional information. We propose to bridge these two approaches while allowing the number of factor proxies to diverge, and hence make the latent factor model estimation robust, flexible, and statistically more accurate. As a bonus, the number of factors is also allowed to grow. At the heart of our method is a penalized reduced rank regression to combine information. To further deal with heavy-tailed data, a computationally attractive penalized robust reduced rank regression method is proposed. We establish faster rates of convergence compared with the benchmark. 
Extensive simulations and real examples are used to illustrate the advantages.},
  number = {2},
  journal = {Journal of Econometrics},
  author = {Wan, Runzhe and Li, Yingying and Lu, Wenbin and Song, Rui},
  year = {2024},
  month = feb
}

% Rhodes, Davidian, Lu (2023), Annals of Applied Statistics 17(3): LSTM-based dynamic prediction of residual life.
@article{rhodes_davidian_lu_2023,
  title = {Dynamic Prediction of Residual Life with Longitudinal Covariates Using Long Short-Term Memory Networks},
  volume = {17},
  issn = {1941-7330},
  doi = {10.1214/22-AOAS1706},
  abstractNote = {Sepsis, a complex medical condition that involves severe infections with life-threatening organ dysfunction, is a leading cause of death worldwide. Treatment of sepsis is highly challenging. When making treatment decisions, clinicians and patients desire accurate predictions of mean residual life (MRL) that leverage all available patient information, including longitudinal biomarker data. Biomarkers are biological, clinical, and other variables reflecting disease progression that are often measured repeatedly on patients in the clinical setting. Dynamic prediction methods leverage accruing biomarker measurements to improve performance, providing updated predictions as new measurements become available. We introduce two methods for dynamic prediction of MRL using longitudinal biomarkers. In both methods, we begin by using long short-term memory networks (LSTMs) to construct encoded representations of the biomarker trajectories, referred to as "context vectors." In our first method, the LSTM-GLM, we dynamically predict MRL via a transformed MRL model that includes the context vectors as covariates. In our second method, the LSTM-NN, we dynamically predict MRL from the context vectors using a feed-forward neural network. We demonstrate the improved performance of both proposed methods relative to competing methods in simulation studies. We apply the proposed methods to dynamically predict the restricted mean residual life (RMRL) of septic patients in the intensive care unit using electronic medical record data. 
We demonstrate that the LSTM-GLM and the LSTM-NN are useful tools for producing individualized, real-time predictions of RMRL that can help inform the treatment decisions of septic patients.},
  number = {3},
  journal = {Annals of Applied Statistics},
  author = {Rhodes, Grace and Davidian, Marie and Lu, Wenbin},
  year = {2023},
  month = sep,
  pages = {2039--2058}
}

% Chen, Lu, Song, Ghosh (2023), Journal of the American Statistical Association: the FLAP algorithm.
@article{chen_lu_song_ghosh_2023,
  title = {On Learning and Testing of Counterfactual Fairness through Data Preprocessing},
  volume = {4},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1537-274X},
  doi = {10.1080/01621459.2023.2186885},
  abstractNote = {Machine learning has become more important in real-life decision-making but people are concerned about the ethical problems it may bring when used improperly. Recent work brings the discussion of machine learning fairness into the causal framework and elaborates on the concept of Counterfactual Fairness. In this paper, we develop the Fair Learning through dAta Preprocessing (FLAP) algorithm to learn counterfactually fair decisions from biased training data and formalize the conditions where different data preprocessing procedures should be used to guarantee counterfactual fairness. We also show that Counterfactual Fairness is equivalent to the conditional independence of the decisions and the sensitive attributes given the processed non-sensitive attributes, which enables us to detect discrimination in the original decision using the processed data. 
The performance of our algorithm is illustrated using simulated data and real-world applications.},
  journal = {Journal of the American Statistical Association},
  author = {Chen, Haoyu and Lu, Wenbin and Song, Rui and Ghosh, Pulak},
  year = {2023},
  month = apr
}

% Chu, Lu, Yang (2023), Biometrika: treatment regime learning for a target population from summary statistics.
@article{chu_lu_yang_2023,
  title = {Targeted optimal treatment regime learning using summary statistics},
  volume = {3},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1464-3510},
  doi = {10.1093/biomet/asad020},
  abstractNote = {Summary Personalized decision-making, aiming to derive optimal treatment regimes based on individual characteristics, has recently attracted increasing attention in many fields, such as medicine, social services and economics. Current literature mainly focuses on estimating treatment regimes from a single source population. In real-world applications, the distribution of a target population can be different from that of the source population. Therefore, treatment regimes learned by existing methods may not generalize well to the target population. Because of privacy concerns and other practical issues, individual-level data from the target population are often not available, which makes treatment regime learning more challenging. We consider the problem of treatment regime estimation when the source and target populations may be heterogeneous, individual-level data are available from the source population and only the summary information of covariates, such as moments, is accessible from the target population. We develop a weighting framework that tailors a treatment regime for a given target population by leveraging the available summary statistics. Specifically, we propose a calibrated augmented inverse probability weighted estimator of the value function for the target population and estimate an optimal treatment regime by maximizing this estimator within a class of prespecified regimes. 
We show that the proposed calibrated estimator is consistent and asymptotically normal even with flexible semi/nonparametric models for nuisance function approximation, and that the variance of the value estimator can be consistently estimated. We demonstrate the empirical performance of the proposed method using simulation studies and a real application using two datasets on sepsis.},
  journal = {Biometrika},
  author = {Chu, J. and Lu, W. and Yang, S.},
  year = {2023},
  month = mar
}

% Liu, Song, Lu, Xiao (2022), Journal of Computational and Graphical Statistics: probit tensor factorization (PTF).
@article{liu_song_lu_xiao_2022,
  title = {A Probit Tensor Factorization Model For Relational Learning},
  volume = {3},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1537-2715},
  doi = {10.1080/10618600.2021.2003204},
  abstractNote = {Abstract With the proliferation of knowledge graphs, modeling data with complex multi-relational structure has gained increasing attention in the area of statistical relational learning. One of the most important goals of statistical relational learning is link prediction, that is, predicting whether certain relations exist in the knowledge graph. A large number of models and algorithms have been proposed to perform link prediction, among which tensor factorization method has proven to achieve state-of-the-art performance in terms of computation efficiency and prediction accuracy. However, a common drawback of the existing tensor factorization models is that the missing relations and nonexisting relations are treated in the same way, which results in a loss of information. To address this issue, we propose a binary tensor factorization model with probit link, which not only inherits the computation efficiency from the classic tensor factorization model but also accounts for the binary nature of relational data. Our proposed probit tensor factorization (PTF) model shows advantages in both the prediction accuracy and interpretability. 
Supplementary files for this article are available online.},
  journal = {Journal of Computational and Graphical Statistics},
  author = {Liu, Ye and Song, Rui and Lu, Wenbin and Xiao, Yanghua},
  year = {2022},
  month = mar
}

% Johnson, Lu, Davidian (2022), Biometrics: subgroup testing via value-function differences.
@article{johnson_lu_davidian_2022,
  title = {A general framework for subgroup detection via one-step value difference estimation},
  volume = {8},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1541-0420},
  doi = {10.1111/biom.13711},
  abstractNote = {Abstract Recent statistical methodology for precision medicine has focused on either identification of subgroups with enhanced treatment effects or estimating optimal treatment decision rules so that treatment is allocated in a way that maximizes, on average, predefined patient outcomes. Less attention has been given to subgroup testing, which involves evaluation of whether at least a subgroup of the population benefits from an investigative treatment, compared to some control or standard of care. In this work, we propose a general framework for testing for the existence of a subgroup with enhanced treatment effects based on the difference of the estimated value functions under an estimated optimal treatment regime and a fixed regime that assigns everyone to the same treatment. Our proposed test does not require specification of the parametric form of the subgroup and allows heterogeneous treatment effects within the subgroup. The test applies to cases when the outcome of interest is either a time-to-event or a (uncensored) scalar, and is valid at the exceptional law. 
To demonstrate the empirical performance of the proposed test, we study the type I error and power of the test statistics in simulations and also apply our test to data from a Phase III trial in patients with hematological malignancies.},
  journal = {Biometrics},
  author = {Johnson, Dana and Lu, Wenbin and Davidian, Marie},
  year = {2022},
  month = aug
}

% Yu, Lu, Yang, Ghosh (2022), Biometrika: multiplicative structural nested mean models for zero-inflated outcomes.
@article{yu_lu_yang_ghosh_2022,
  title = {A multiplicative structural nested mean model for zero-inflated outcomes},
  volume = {8},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1464-3510},
  doi = {10.1093/biomet/asac050},
  abstractNote = {Summary Zero-inflated nonnegative outcomes are common in many applications. In this work, motivated by freemium mobile game data, we propose a class of multiplicative structural nested mean models for zero-inflated nonnegative outcomes which flexibly describes the joint effect of a sequence of treatments in the presence of time-varying confounders. The proposed estimator solves a doubly robust estimating equation, where the nuisance functions, namely the propensity score and conditional outcome means given confounders, are estimated parametrically or nonparametrically. To improve the accuracy, we leverage the characteristic of zero-inflated outcomes by estimating the conditional means in two parts, that is, separately modelling the probability of having positive outcomes given confounders, and the mean outcome conditional on its being positive and given the confounders. We show that the proposed estimator is consistent and asymptotically normal as either the sample size or the follow-up time goes to infinity. Moreover, the typical sandwich formula can be used to estimate the variance of treatment effect estimators consistently, without accounting for the variation due to estimating nuisance functions. 
Simulation studies and an application to a freemium mobile game dataset are presented to demonstrate the empirical performance of the proposed method and support our theoretical findings.},
  journal = {Biometrika},
  author = {Yu, Miao and Lu, Wenbin and Yang, Shu and Ghosh, Pulak},
  year = {2022},
  month = aug
}

% Cai et al. (2022), Statistics in Medicine; no volume, issue, or pages in this record.
@article{cai_lu_west_mehrotra_huang_2022,
  title = {{CAPITAL}: Optimal subgroup identification via constrained policy tree search},
  issn = {1097-0258},
  doi = {10.1002/sim.9507},
  abstractNote = {Personalized medicine, a paradigm of medicine tailored to a patient's characteristics, is an increasingly attractive field in health care. An important goal of personalized medicine is to identify a subgroup of patients, based on baseline covariates, that benefits more from the targeted treatment than other comparative treatments. Most of the current subgroup identification methods only focus on obtaining a subgroup with an enhanced treatment effect without paying attention to subgroup size. Yet, a clinically meaningful subgroup learning approach should identify the maximum number of patients who can benefit from the better treatment. In this article, we present an optimal subgroup selection rule (SSR) that maximizes the number of selected patients, and in the meantime, achieves the pre‐specified clinically meaningful mean outcome, such as the average treatment effect. We derive two equivalent theoretical forms of the optimal SSR based on the contrast function that describes the treatment‐covariates interaction in the outcome. We further propose a constrained policy tree search algorithm (CAPITAL) to find the optimal SSR within the interpretable decision tree class. The proposed method is flexible to handle multiple constraints that penalize the inclusion of patients with negative treatment effects, and to address time to event data using the restricted mean survival time as the clinically interesting mean outcome. 
Extensive simulations, comparison studies, and real data applications are conducted to demonstrate the validity and utility of our method.},
  journal = {Statistics in Medicine},
  author = {Cai, Hengrui and Lu, Wenbin and West, Rachel Marceau and Mehrotra, Devan V. and Huang, Lingkang},
  year = {2022},
  month = jul
}

% Jin, Lu, Chen, Liu (2022), Biometrics; no volume, issue, or pages in this record.
@article{jin_lu_chen_liu_2022,
  title = {Change-plane analysis for subgroup detection with a continuous treatment},
  issn = {1541-0420},
  doi = {10.1111/biom.13762},
  abstractNote = {Abstract Detecting and characterizing subgroups with differential effects of a binary treatment has been widely studied and led to improvements in patient outcomes and population risk management. Under the setting of a continuous treatment, however, such investigations remain scarce. We propose a semiparametric change-plane model and consequently a doubly robust test statistic for assessing the existence of two subgroups with differential treatment effects under a continuous treatment. The proposed testing procedure is valid when either the baseline function for the covariate effects or the generalized propensity score function for the continuous treatment is correctly specified. The asymptotic distributions of the test statistic under the null and local alternative hypotheses are established. When the null hypothesis of no subgroup is rejected, the change-plane parameters that define the subgroups can be estimated. This paper provides a unified framework of the change-plane method to handle various types of outcomes, including the exponential family of distributions and time-to-event outcomes. Additional extensions with nonparametric estimation approaches are also provided. We evaluate the performance of our proposed methods through extensive simulation studies under various scenarios. 
An application to the Health Effects of Arsenic Longitudinal Study with a continuous environmental exposure of arsenic is presented.},
  journal = {Biometrics},
  author = {Jin, Peng and Lu, Wenbin and Chen, Yu and Liu, Mengling},
  year = {2022},
  month = oct
}

% Li, Lu, Shu, Toh, Wang (2022), Biostatistics; no volume, issue, or pages in this record.
@article{li_lu_shu_toh_wang_2022,
  title = {Distributed {Cox} proportional hazards regression using summary-level information},
  issn = {1468-4357},
  doi = {10.1093/biostatistics/kxac006},
  abstractNote = {Summary Individual-level data sharing across multiple sites can be infeasible due to privacy and logistical concerns. This article proposes a general distributed methodology to fit Cox proportional hazards models without sharing individual-level data in multi-site studies. We make inferences on the log hazard ratios based on an approximated partial likelihood score function that uses only summary-level statistics. This approach can be applied to both stratified and unstratified models, accommodate both discrete and continuous exposure variables, and permit the adjustment of multiple covariates. In particular, the fitting of stratified Cox models can be carried out with only one file transfer of summary-level information. We derive the asymptotic properties of the proposed estimators and compare the proposed estimators with the maximum partial likelihood estimators using pooled individual-level data and meta-analysis methods through simulation studies. 
We apply the proposed method to a real-world data set to examine the effect of sleeve gastrectomy versus Roux-en-Y gastric bypass on the time to first postoperative readmission.},
  journal = {Biostatistics},
  author = {Li, Dongdong and Lu, Wenbin and Shu, Di and Toh, Sengwee and Wang, Rui},
  year = {2022},
  month = feb
}

% Zhao, Ma, Lu (2022), Statistica Sinica 32, pp. 2359-2380; no month or issue number in this record.
@article{zhao_ma_lu_2022,
  title = {Efficient Estimation for Dimension Reduction with Censored Survival Data},
  volume = {32},
  issn = {1996-8507},
  doi = {10.5705/ss.202020.0404},
  abstractNote = {We propose a general index model for survival data, that generalizes many commonly used semiparametric survival models and belongs to the framework of dimension reduction. Using a combination of a geometric approach in semiparametrics and a martingale treatment in survival data analysis, we devise estimation procedures that are feasible and do not require covariate-independent censoring, as assumed in many dimension-reduction methods for censored survival data. We establish the root-n consistency and asymptotic normality of the proposed estimators and derive the most efficient estimator in this class for the general index model. Numerical experiments demonstrate the empirical performance of the proposed estimators, and an application to an AIDS data set further illustrates the usefulness of the work.},
  journal = {Statistica Sinica},
  author = {Zhao, Ge and Ma, Yanyuan and Lu, Wenbin},
  year = {2022},
  pages = {2359--2380}
}

% Weaver, Xiao, Lu (2022), Biometrics: functional data analysis with informative observation times.
@article{weaver_xiao_lu_2022,
  title = {Functional data analysis for longitudinal data with informative observation times},
  volume = {3},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1541-0420},
  url = {https://doi.org/10.1111/biom.13646},
  doi = {10.1111/biom.13646},
  abstractNote = {Abstract In functional data analysis for longitudinal data, the observation process is typically assumed to be noninformative, which is often violated in real applications. Thus, methods that fail to account for the dependence between observation times and longitudinal outcomes may result in biased estimation. 
For longitudinal data with informative observation times, we find that under a general class of shared random effect models, a commonly used functional data method may lead to inconsistent model estimation while another functional data method results in consistent and even rate-optimal estimation. Indeed, we show that the mean function can be estimated appropriately via penalized splines and that the covariance function can be estimated appropriately via penalized tensor-product splines, both with specific choices of parameters. For the proposed method, theoretical results are provided, and simulation studies and a real data analysis are conducted to demonstrate its performance.},
  journal = {Biometrics},
  publisher = {Wiley},
  author = {Weaver, Caleb and Xiao, Luo and Lu, Wenbin},
  year = {2022},
  month = mar
}

% Cook, Lu, Wang (2022), Biometrics; no volume, issue, or pages in this record.
@article{cook_lu_wang_2022,
  title = {Marginal proportional hazards models for clustered interval-censored data with time-dependent covariates},
  issn = {1541-0420},
  doi = {10.1111/biom.13787},
  abstractNote = {Abstract The Botswana Combination Prevention Project was a cluster-randomized HIV prevention trial whose follow-up period coincided with Botswana's national adoption of a universal test and treat strategy for HIV management. Of interest is whether, and to what extent, this change in policy modified the preventative effects of the study intervention. To address such questions, we adopt a stratified proportional hazards model for clustered interval-censored data with time-dependent covariates and develop a composite expectation maximization algorithm that facilitates estimation of model parameters without placing parametric assumptions on either the baseline hazard functions or the within-cluster dependence structure. We show that the resulting estimators for the regression parameters are consistent and asymptotically normal. 
We also propose and provide theoretical justification for the use of the profile composite likelihood function to construct a robust sandwich estimator for the variance. We characterize the finite-sample performance and robustness of these estimators through extensive simulation studies. Finally, we conclude by applying this stratified proportional hazards model to a re-analysis of the Botswana Combination Prevention Project, with the national adoption of a universal test and treat strategy now modeled as a time-dependent covariate.},
  journal = {Biometrics},
  author = {Cook, Kaitlyn and Lu, Wenbin and Wang, Rui},
  year = {2022},
  month = dec
}

% Huang et al. (2022), Microbiome 10(1): the POST association test; no page range or article number in this record.
@article{huang_callahan_wu_holloway_brochu_lu_peng_tzeng_2022,
  title = {Phylogeny-guided microbiome {OTU}-specific association test ({POST})},
  volume = {10},
  issn = {2049-2618},
  doi = {10.1186/s40168-022-01266-3},
  abstractNote = {Abstract Background: The relationship between host conditions and microbiome profiles, typically characterized by operational taxonomic units (OTUs), contains important information about the microbial role in human health. Traditional association testing frameworks are challenged by the high dimensionality and sparsity of typical microbiome profiles. Phylogenetic information is often incorporated to address these challenges with the assumption that evolutionarily similar taxa tend to behave similarly. However, this assumption may not always be valid due to the complex effects of microbes, and phylogenetic information should be incorporated in a data-supervised fashion. Results: In this work, we propose a local collapsing test called phylogeny-guided microbiome OTU-specific association test (POST). In POST, whether or not to borrow information and how much information to borrow from the neighboring OTUs in the phylogenetic tree are supervised by phylogenetic distance and the outcome-OTU association. 
POST is constructed under the kernel machine framework to accommodate complex OTU effects and extends kernel machine microbiome tests from community level to OTU level. Using simulation studies, we show that when the phylogenetic tree is informative, POST has better performance than existing OTU-level association tests. When the phylogenetic tree is not informative, POST achieves similar performance as existing methods. Finally, in real data applications on bacterial vaginosis and on preterm birth, we find that POST can identify similar or more outcome-associated OTUs that are of biological relevance compared to existing methods. Conclusions: Using POST, we show that adaptively leveraging the phylogenetic information can enhance the selection performance of associated microbiome features by improving the overall true-positive and false-positive detection. We developed a user friendly R package POSTm which is freely available on CRAN (https://CRAN.R-project.org/package=POSTm).},
  number = {1},
  journal = {Microbiome},
  author = {Huang, Caizhi and Callahan, Benjamin John and Wu, Michael C. and Holloway, Shannon T. and Brochu, Hayden and Lu, Wenbin and Peng, Xinxia and Tzeng, Jung-Ying},
  year = {2022},
  month = jun
}

% Zhou, Zhang, Lu (2022), Journal of Statistical Software 101(9): the TransModel R package.
@article{zhou_zhang_lu_2022,
  title = {{TransModel}: An {R} Package for Linear Transformation Model with Censored Data},
  volume = {101},
  issn = {1548-7660},
  doi = {10.18637/jss.v101.i09},
  abstractNote = {Linear transformation models, including the proportional hazards model and proportional odds model, under right censoring were discussed by Chen, Jin, and Ying (2002). The asymptotic variance of the estimator they proposed has a closed form and can be obtained easily by plug-in rules, which improves the computational efficiency. We develop an R package TransModel based on Chen’s approach. 
The detailed usage of the package is discussed, and the function is applied to the Veterans’ Administration lung cancer data.},
  number = {9},
  journal = {Journal of Statistical Software},
  author = {Zhou, Jie and Zhang, Jiajia and Lu, Wenbin},
  year = {2022},
  month = jan
}

% Shi, Song, Lu (2021), Annals of Statistics 49(1), pp. 49-75: concordance and value information criteria.
@article{shi_song_lu_2021,
  title = {Concordance and Value Information Criteria for Optimal Treatment Decision},
  volume = {49},
  issn = {0090-5364},
  doi = {10.1214/19-AOS1908},
  abstractNote = {Personalized medicine is a medical procedure that receives considerable scientific and commercial attention. The goal of personalized medicine is to assign the optimal treatment regime for each individual patient, according to his/her personal prognostic information. When there are a large number of pretreatment variables, it is crucial to identify those important variables that are necessary for treatment decision making. In this paper, we study two information criteria: the concordance and value information criteria, for variable selection in optimal treatment decision making. We consider both fixed-$p$ and high dimensional settings, and show our information criteria are consistent in model/tuning parameter selection. We further apply our information criteria to four estimation approaches, including robust learning, concordance-assisted learning, penalized A-learning and sparse concordance-assisted learning, and demonstrate the empirical performance of our methods by simulations.},
  number = {1},
  journal = {Annals of Statistics},
  author = {Shi, C. and Song, R. and Lu, W.},
  year = {2021},
  month = feb,
  pages = {49--75}
}

% Cai, Song, Lu (2021), Stat 10(1): the GEAR decision rule using an auxiliary sample.
@article{cai_song_lu_2021,
  title = {{GEAR}: On optimal decision making with auxiliary data},
  volume = {10},
  issn = {2049-1573},
  doi = {10.1002/sta4.399},
  abstractNote = {Personalized optimal decision making, finding the optimal decision rule (ODR) based on individual characteristics, has attracted increasing attention recently in many fields, such as education, economics, and medicine. 
Current ODR methods usually require the primary outcome of interest in samples for assessing treatment effects, namely, the experimental sample. However, in many studies, treatments may have a long‐term effect, and as such, the primary outcome of interest cannot be observed in the experimental sample due to the limited duration of experiments, which makes the estimation of ODR impossible. This paper is inspired to address this challenge by making use of an auxiliary sample to facilitate the estimation of ODR in the experimental sample. We propose an auGmented inverse propensity weighted Experimental and Auxiliary sample‐based decision Rule (GEAR) by maximizing the augmented inverse propensity weighted value estimator over a class of decision rules using the experimental sample, with the primary outcome being imputed based on the auxiliary sample. The asymptotic properties of the proposed GEAR estimators and their associated value estimators are established. Simulation studies are conducted to demonstrate its empirical validity with a real AIDS application.},
  number = {1},
  journal = {Stat},
  author = {Cai, Hengrui and Song, Rui and Lu, Wenbin},
  year = {2021},
  month = dec
}

% Chang et al. (2021), Bioinformatics 37(16), pp. 2259-2265: tensor-based association test for multi-omics data.
@article{chang_yang_lu_huang_huang_hung_miecznikowski_lu_tzeng_2021,
  title = {Gene-set integrative analysis of multi-omics data using tensor-based association test},
  volume = {37},
  issn = {1460-2059},
  doi = {10.1093/bioinformatics/btab125},
  abstractNote = {Abstract Motivation Facilitated by technological advances and the decrease in costs, it is feasible to gather subject data from several omics platforms. Each platform assesses different molecular events, and the challenge lies in efficiently analyzing these data to discover novel disease genes or mechanisms. A common strategy is to regress the outcomes on all omics variables in a gene set. However, this approach suffers from problems associated with high-dimensional inference. 
Results We introduce a tensor-based framework for variable-wise inference in multi-omics analysis. By accounting for the matrix structure of an individual’s multi-omics data, the proposed tensor methods incorporate the relationship among omics effects, reduce the number of parameters, and boost the modeling efficiency. We derive the variable-specific tensor test and enhance computational efficiency of tensor modeling. Using simulations and data applications on the Cancer Cell Line Encyclopedia (CCLE), we demonstrate our method performs favorably over baseline methods and will be useful for gaining biological insights in multi-omics analysis. Availability and implementation R function and instruction are available from the authors’ website: https://www4.stat.ncsu.edu/~jytzeng/Software/TR.omics/TRinstruction.pdf. Supplementary information Supplementary data are available at Bioinformatics online. },
  number = {16},
  journal = {Bioinformatics},
  author = {Chang, Sheng-Mao and Yang, Meng and Lu, Wenbin and Huang, Yu-Jyun and Huang, Yueyang and Hung, Hung and Miecznikowski, Jeffrey C. and Lu, Tzu-Pin and Tzeng, Jung-Ying},
  year = {2021},
  month = aug,
  pages = {2259--2265}
}

% Chen et al. (2021), Biometrics: optimal treatment initiation time via restricted mean residual lifetime.
@article{chen_song_zhang_adams_sun_lu_2021,
  title = {On estimating optimal regime for treatment initiation time based on restricted mean residual lifetime},
  volume = {8},
  internal-note = {TODO(review): volume equals the month number and no issue or pages are recorded -- possibly an export artifact; verify against the DOI},
  issn = {1541-0420},
  doi = {10.1111/biom.13530},
  abstractNote = {Abstract When to initiate treatment on patients is an important problem in many medical studies such as AIDS and cancer. In this article, we formulate the treatment initiation time problem for time‐to‐event data and propose an optimal individualized regime that determines the best treatment initiation time for individual patients based on their characteristics. 
Different from existing optimal treatment regimes where treatments are undertaken at a pre‐specified time, here new challenges arise from the complicated missing mechanisms in treatment initiation time data and the continuous treatment rule in terms of initiation time. To tackle these challenges, we propose to use restricted mean residual lifetime as a value function to evaluate the performance of different treatment initiation regimes, and develop a nonparametric estimator for the value function, which is consistent even when treatment initiation times are not completely observable and their distribution is unknown. We also establish the asymptotic properties of the resulting estimator in the decision rule and its associated value function estimator. In particular, the asymptotic distribution of the estimated value function is nonstandard, which follows a weighted chi‐squared distribution. The finite‐sample performance of the proposed method is evaluated by simulation studies and is further illustrated with an application to a breast cancer data.}, journal={BIOMETRICS}, author={Chen, Xin and Song, Rui and Zhang, Jiajia and Adams, Swann Arp and Sun, Liuquan and Lu, Wenbin}, year={2021}, month={Aug} } @article{yu_lu_song_2021, title={Online Testing of Subgroup Treatment Effects Based on Value Difference}, ISSN={["1550-4786"]}, DOI={10.1109/ICDM51629.2021.00189}, abstractNote={Online A/B testing plays a critical role in the high-tech industry to guide product development and accelerate innovation. It performs a null hypothesis statistical test to determine which variant is better. However, a typical A/B test presents two problems: (i) a fixed-horizon framework inflates the false-positive errors under continuous monitoring; (ii) the homogeneous effects assumption fails to identify a subgroup with a beneficial treatment effect. 
In this paper, we propose a sequential test for subgroup treatment effects based on value difference, named SUBTLE, to address these two problems simultaneously. The SUBTLE allows the experimenters to “peek” at the results during the experiment without harming the statistical guarantees. It assumes heterogeneous treatment effects and aims to test if some subgroup of the population will benefit from the investigative treatment. If the testing result indicates the existence of such a subgroup, a subgroup will be identified using a readily available estimated optimal treatment rule. We examine the empirical performance of our proposed test on both simulations and a real dataset. The results show that the SUBTLE has high detection power with controlled type I error at any time, is more robust to noise covariates, and can achieve early stopping compared with the corresponding fixed-horizon test.}, journal={2021 21ST IEEE INTERNATIONAL CONFERENCE ON DATA MINING (ICDM 2021)}, author={Yu, Miao and Lu, Wenbin and Song, Rui}, year={2021}, pages={1463–1468} } @article{wang_zhang_cai_lu_tang_2021, title={Semiparametric estimation for proportional hazards mixture cure model allowing non-curable competing risk}, volume={211}, ISSN={["1873-1171"]}, DOI={10.1016/j.jspi.2020.06.009}, abstractNote={With advancements in medical research, broader range of diseases may be curable, which indicates some patients may not die owing to the disease of interest. The mixture cure model, which can capture patients being cured, has received an increasing attention in practice. However, the existing mixture cure models only focus on major events with potential cures while ignoring the potential risks posed by other non-curable competing events, which are commonly observed in the real world. The main purpose of this article is to propose a new mixture cure model allowing non-curable competing risk. 
A semiparametric estimation method is developed via an EM algorithm, the asymptotic properties of parametric estimators are provided and its performance is demonstrated through comprehensive simulation studies. Finally, the proposed method is applied to a prostate cancer clinical trial dataset.}, journal={JOURNAL OF STATISTICAL PLANNING AND INFERENCE}, author={Wang, Yijun and Zhang, Jiajia and Cai, Chao and Lu, Wenbin and Tang, Yincai}, year={2021}, month={Mar}, pages={171–189} } @article{shi_zhang_lu_song_2021, title={Statistical inference of the value function for reinforcement learning in infinite-horizon settings}, volume={12}, ISSN={["1467-9868"]}, DOI={10.1111/rssb.12465}, abstractNote={AbstractReinforcement learning is a general technique that allows an agent to learn an optimal policy and interact with an environment in sequential decision-making problems. The goodness of a policy is measured by its value function starting from some initial state. The focus of this paper was to construct confidence intervals (CIs) for a policy’s value in infinite horizon settings where the number of decision points diverges to infinity. We propose to model the action-value state function (Q-function) associated with a policy based on series/sieve method to derive its confidence interval. When the target policy depends on the observed data as well, we propose a SequentiAl Value Evaluation (SAVE) method to recursively update the estimated policy and its value estimator. As long as either the number of trajectories or the number of decision points diverges to infinity, we show that the proposed CI achieves nominal coverage even in cases where the optimal policy is not unique. Simulation studies are conducted to back up our theoretical findings. We apply the proposed method to a dataset from mobile health studies and find that reinforcement learning algorithms could help improve patient’s health status. 
A Python implementation of the proposed procedure is available at https://github.com/shengzhang37/SAVE.}, journal={JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES B-STATISTICAL METHODOLOGY}, author={Shi, Chengchun and Zhang, Sheng and Lu, Wenbin and Song, Rui}, year={2021}, month={Dec} } @article{fan_lu_zhou_2021, title={Testing error heterogeneity in censored linear regression}, volume={161}, ISSN={["1872-7352"]}, DOI={10.1016/j.csda.2021.107207}, abstractNote={In censored linear regression, a key assumption is that the error is independent of predictors. We develop an omnibus test to check error heterogeneity in censored linear regression. Our approach is based on testing the variance component in a working kernel machine regression model. The limiting null distribution of the proposed test statistic is shown to be a weighted sum of independent chi-squared distributions with one degree of freedom. A resampling scheme is derived to approximate the null distribution. The empirical performance of the proposed tests is evaluated via simulation and two real data sets.}, journal={COMPUTATIONAL STATISTICS & DATA ANALYSIS}, author={Fan, Caiyun and Lu, Wenbin and Zhou, Yong}, year={2021}, month={Sep} } @article{brucker_lu_west_yu_hsiao_hsiao_lin_magnusson_sullivan_szatkiewicz_et al._2020, title={Association test using Copy Number Profile Curves (CONCUR) enhances power in rare copy number variant analysis}, volume={16}, ISSN={["1553-7358"]}, url={https://doi.org/10.1371/journal.pcbi.1007797}, DOI={10.1371/journal.pcbi.1007797}, abstractNote={Copy number variants (CNVs) are the gain or loss of DNA segments in the genome that can vary in dosage and length. CNVs comprise a large proportion of variation in human genomes and impact health conditions. 
To detect rare CNV associations, kernel-based methods have been shown to be a powerful tool due to their flexibility in modeling the aggregate CNV effects, their ability to capture effects from different CNV features, and their accommodation of effect heterogeneity. To perform a kernel association test, a CNV locus needs to be defined so that locus-specific effects can be retained during aggregation. However, CNV loci are arbitrarily defined and different locus definitions can lead to different performance depending on the underlying effect patterns. In this work, we develop a new kernel-based test called CONCUR (i.e., copy number profile curve-based association test) that is free from a definition of locus and evaluates CNV-phenotype associations by comparing individuals’ copy number profiles across the genomic regions. CONCUR is built on the proposed concepts of “copy number profile curves” to describe the CNV profile of an individual, and the “common area under the curve (cAUC) kernel” to model the multi-feature CNV effects. The proposed method captures the effects of CNV dosage and length, accounts for the numerical nature of copy numbers, and accommodates between- and within-locus etiological heterogeneity without the need to define artificial CNV loci as required in current kernel methods. In a variety of simulation settings, CONCUR shows comparable or improved power over existing approaches. Real data analyses suggest that CONCUR is well powered to detect CNV effects in the Swedish Schizophrenia Study and the Taiwan Biobank.}, number={5}, journal={PLOS COMPUTATIONAL BIOLOGY}, publisher={Public Library of Science (PLoS)}, author={Brucker, Amanda and Lu, Wenbin and West, Rachel Marceau and Yu, Qi-You and Hsiao, Chuhsing Kate and Hsiao, Tzu-Hung and Lin, Ching-Heng and Magnusson, Patrik K. E. and Sullivan, Patrick F. and Szatkiewicz, Jin P. 
and others}, editor={Ma, Jian}, year={2020}, month={May} } @article{jiang_song_li_zeng_lu_he_xu_wang_qian_cheng_et al._2019, title={ENTROPY LEARNING FOR DYNAMIC TREATMENT REGIMES}, volume={29}, ISSN={["1996-8507"]}, DOI={10.5705/ss.202018.0076}, abstractNote={Estimating optimal individualized treatment rules (ITRs) in single or multi-stage clinical trials is one key solution to personalized medicine and has received more and more attention in statistical community. Recent development suggests that using machine learning approaches can significantly improve the estimation over model-based methods. However, proper inference for the estimated ITRs has not been well established in machine learning based approaches. In this paper, we propose a entropy learning approach to estimate the optimal individualized treatment rules (ITRs). We obtain the asymptotic distributions for the estimated rules so further provide valid inference. The proposed approach is demonstrated to perform well in finite sample through extensive simulation studies. Finally, we analyze data from a multi-stage clinical trial for depression patients. Our results offer novel findings that are otherwise not revealed with existing approaches.}, number={4}, journal={STATISTICA SINICA}, author={Jiang, Binyan and Song, Rui and Li, Jialiang and Zeng, Donglin and Lu, Wenbin and He, Xin and Xu, Shirong and Wang, Junhui and Qian, Min and Cheng, Bin and others}, year={2019}, month={Oct}, pages={1633–1710} } @article{yu_lu_huang_2020, title={MODELING AND ESTIMATION OF CONTAGION-BASED SOCIAL NETWORK DEPENDENCE WITH TIME-TO-EVENT DATA}, volume={30}, ISSN={["1996-8507"]}, DOI={10.5705/ss.202018.0222}, abstractNote={Social network data consists of social ties, node characteristics and behaviors over time. It is known that people who are close to each other in a social network are more likely to behave in a similar way. 
One of the reasons they act similarly is due to the peer influence and social contagion that acts along the network ties. A primary interest of social network data analysis is to identify the contagion-based social correlation. In this work, we model and estimate the contagion-based social network dependence based on time-to-event data. A generalized linear transformation model is proposed for the conditional survival probability at each observed event time, which uses a time-varying covariate to incorporate the network structure and quantify the contagion-based social correlation. We develop the nonparametric maximum likelihood estimation for the proposed model. The consistency and asymptotic normality of the resulting estimators for the regression parameters are established. Simulations are conducted to evaluate the empirical performance of the proposed estimators. We further}, number={4}, journal={STATISTICA SINICA}, author={Yu, Lin and Lu, Wenbin and Huang, Danyang}, year={2020}, month={Oct}, pages={2051–2074} } @article{jeng_peng_lu_2021, title={Model Selection With Mixed Variables on the Lasso Path}, volume={83}, ISSN={["0976-8394"]}, DOI={10.1007/s13571-019-00219-5}, number={1}, journal={SANKHYA-SERIES B-APPLIED AND INTERDISCIPLINARY STATISTICS}, author={Jeng, X. Jessie and Peng, Huimin and Lu, Wenbin}, year={2021}, month={May}, pages={170–184} } @article{gu_zhang_lu_wang_felizzi_2020, title={Semiparametric estimation of the cure fraction in population-based cancer survival analysis}, volume={39}, ISSN={["1097-0258"]}, DOI={10.1002/sim.8693}, abstractNote={With rapid development in medical research, the treatment of diseases including cancer has progressed dramatically and those survivors may die from causes other than the one under study, especially among elderly patients. 
Motivated by the Surveillance, Epidemiology, and End Results (SEER) female breast cancer study, background mortality is incorporated into the mixture cure proportional hazards (MCPH) model to improve the cure fraction estimation in population‐based cancer studies. Here, that patients are “cured” is defined as when the mortality rate of the individuals in diseased group returns to the same level as that expected in the general population, where the population level mortality is presented by the mortality table of the United States. The semiparametric estimation method based on the EM algorithm for the MCPH model with background mortality (MCPH+BM) is further developed and validated via comprehensive simulation studies. Real data analysis shows that the proposed semiparametric MCPH+BM model may provide more accurate estimation in population‐level cancer study.}, number={26}, journal={STATISTICS IN MEDICINE}, author={Gu, Ennan and Zhang, Jiajia and Lu, Wenbin and Wang, Lianming and Felizzi, Federico}, year={2020}, month={Nov}, pages={3787–3805} } @article{zhou_zhang_mclain_lu_sui_hardin_2020, title={Semiparametric regression of the illness-death model with interval censored disease incidence time: An application to the ACLS data}, volume={29}, ISSN={["1477-0334"]}, DOI={10.1177/0962280220939123}, abstractNote={ To investigate the effect of fitness on cardiovascular disease and all-cause mortality using the Aerobics Center Longitudinal Study, we develop a semiparametric illness-death model account for intermittent observations of the cardiovascular disease incidence time and the right censored data of all-cause mortality. The main challenge in estimation is to handle the intermittent observations (interval censoring) of cardiovascular disease incidence time and we develop a semiparametric estimation method based on the expectation-maximization algorithm for a Markov illness-death regression model. 
The variance of the parameters is estimated using profile likelihood methods. The proposed method is evaluated using extensive simulation studies and illustrated with an application to the Aerobics Center Longitudinal Study data. }, number={12}, journal={STATISTICAL METHODS IN MEDICAL RESEARCH}, author={Zhou, Jie and Zhang, Jiajia and McLain, Alexander C. and Lu, Wenbin and Sui, Xuemei and Hardin, James W.}, year={2020}, month={Dec}, pages={3707–3720} } @article{shi_song_lu_li_2021, title={Statistical Inference for High-Dimensional Models via Recursive Online-Score Estimation}, volume={116}, ISSN={["1537-274X"]}, DOI={10.1080/01621459.2019.1710154}, abstractNote={Abstract In this article, we develop a new estimation and valid inference method for single or low-dimensional regression coefficients in high-dimensional generalized linear models. The number of the predictors is allowed to grow exponentially fast with respect to the sample size. The proposed estimator is computed by solving a score function. We recursively conduct model selection to reduce the dimensionality from high to a moderate scale and construct the score equation based on the selected variables. The proposed confidence interval (CI) achieves valid coverage without assuming consistency of the model selection procedure. When the selection consistency is achieved, we show the length of the proposed CI is asymptotically the same as the CI of the “oracle” method which works as well as if the support of the control variables were known. In addition, we prove the proposed CI is asymptotically narrower than the CIs constructed based on the desparsified Lasso estimator and the decorrelated score statistic. Simulation studies and real data applications are presented to back up our theoretical findings. 
Supplementary materials for this article are available online.}, number={535}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Shi, Chengchun and Song, Rui and Lu, Wenbin and Li, Runze}, year={2021}, month={Jul}, pages={1307–1318} } @article{chen_lu_song_2021, title={Statistical Inference for Online Decision Making: In a Contextual Bandit Setting}, volume={116}, ISSN={["1537-274X"]}, url={http://dx.doi.org/10.1080/01621459.2020.1770098}, DOI={10.1080/01621459.2020.1770098}, abstractNote={Abstract Online decision making problem requires us to make a sequence of decisions based on incremental information. Common solutions often need to learn a reward model of different actions given the contextual information and then maximize the long-term reward. It is meaningful to know if the posited model is reasonable and how the model performs in the asymptotic sense. We study this problem under the setup of the contextual bandit framework with a linear reward model. The ε-greedy policy is adopted to address the classic exploration-and-exploitation dilemma. Using the martingale central limit theorem, we show that the online ordinary least squares estimator of model parameters is asymptotically normal. When the linear model is misspecified, we propose the online weighted least squares estimator using the inverse propensity score weighting and also establish its asymptotic normality. Based on the properties of the parameter estimators, we further show that the in-sample inverse propensity weighted value estimator is asymptotically normal. We illustrate our results using simulations and an application to a news article recommendation dataset from Yahoo!. 
Supplementary materials for this article are available online.}, number={533}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, publisher={Informa UK Limited}, author={Chen, Haoyu and Lu, Wenbin and Song, Rui}, year={2021}, month={Mar}, pages={240–255} } @article{shi_lu_song_2020, title={A Sparse Random Projection-Based Test for Overall Qualitative Treatment Effects}, volume={115}, ISSN={["1537-274X"]}, DOI={10.1080/01621459.2019.1604368}, abstractNote={Abstract In contrast to the classical “one-size-fits-all” approach, precision medicine proposes the customization of individualized treatment regimes to account for patients’ heterogeneity in response to treatments. Most of existing works in the literature focused on estimating optimal individualized treatment regimes. However, there has been less attention devoted to hypothesis testing regarding the existence of overall qualitative treatment effects, especially when there are a large number of prognostic covariates. When covariates do not have qualitative treatment effects, the optimal treatment regime will assign the same treatment to all patients regardless of their covariate values. In this article, we consider testing the overall qualitative treatment effects of patients’ prognostic covariates in a high-dimensional setting. We propose a sample splitting method to construct the test statistic, based on a nonparametric estimator of the contrast function. When the dimension of covariates is large, we construct the test based on sparse random projections of covariates into a low-dimensional space. We prove the consistency of our test statistic. In the regular cases, we show the asymptotic power function of our test statistic is asymptotically the same as the “oracle” test statistic which is constructed based on the “optimal” projection matrix. Simulation studies and real data applications validate our theoretical findings. 
Supplementary materials for this article are available online.}, number={531}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Shi, Chengchun and Lu, Wenbin and Song, Rui}, year={2020}, month={Jul}, pages={1201–1213} } @article{zhou_zhang_mclain_lu_sui_hardin_2019, title={A varying-coefficient generalized odds rate model with time-varying exposure: An application to fitness and cardiovascular disease mortality}, volume={75}, ISSN={["1541-0420"]}, DOI={10.1111/biom.13057}, abstractNote={Abstract Varying-coefficient models have become a common tool to determine whether and how the association between an exposure and an outcome changes over a continuous measure. These models are complicated when the exposure itself is time-varying and subjected to measurement error. For example, it is well known that longitudinal physical fitness has an impact on cardiovascular disease (CVD) mortality. It is not known, however, how the effect of longitudinal physical fitness on CVD mortality varies with age. In this paper, we propose a varying-coefficient generalized odds rate model that allows flexible estimation of age-modified effects of longitudinal physical fitness on CVD mortality. In our model, the longitudinal physical fitness is measured with error and modeled using a mixed-effects model, and its associated age-varying coefficient function is represented by cubic B-splines. An expectation-maximization algorithm is developed to estimate the parameters in the joint models of longitudinal physical fitness and CVD mortality. A modified pseudoadaptive Gaussian-Hermite quadrature method is adopted to compute the integrals with respect to random effects involved in the E-step. The performance of the proposed method is evaluated through extensive simulation studies and is further illustrated with an application to cohort data from the Aerobic Center Longitudinal Study.}, number={3}, journal={BIOMETRICS}, author={Zhou, Jie and Zhang, Jiajia and Mclain, Alexander C. 
and Lu, Wenbin and Sui, Xuemei and Hardin, James W.}, year={2019}, month={Sep}, pages={853–863} } @article{west_lu_rotroff_kuenemann_chang_wu_wagner_buse_motsinger-reif_fourches_et al._2019, title={Identifying individual risk rare variants using protein structure guided local tests (POINT)}, volume={15}, ISSN={["1553-7358"]}, DOI={10.1371/journal.pcbi.1006722}, abstractNote={Rare variants are of increasing interest to genetic association studies because of their etiological contributions to human complex diseases. Due to the rarity of the mutant events, rare variants are routinely analyzed on an aggregate level. While aggregation analyses improve the detection of global-level signal, they are not able to pinpoint causal variants within a variant set. To perform inference on a localized level, additional information, e.g., biological annotation, is often needed to boost the information content of a rare variant. Following the observation that important variants are likely to cluster together on functional domains, we propose a protein structure guided local test (POINT) to provide variant-specific association information using structure-guided aggregation of signal. Constructed under a kernel machine framework, POINT performs local association testing by borrowing information from neighboring variants in the 3-dimensional protein space in a data-adaptive fashion. Besides merely providing a list of promising variants, POINT assigns each variant a p-value to permit variant ranking and prioritization. We assess the selection performance of POINT using simulations and illustrate how it can be used to prioritize individual rare variants in PCSK9, ANGPTL4 and CETP in the Action to Control Cardiovascular Risk in Diabetes (ACCORD) clinical trial data.}, number={2}, journal={PLOS COMPUTATIONAL BIOLOGY}, author={West, Rachel Marceau and Lu, Wenbin and Rotroff, Daniel M. and Kuenemann, Melaine A. and Chang, Sheng-Mao and Wu, Michael C. and Wagner, Michael J. and Buse, John B. 
and Motsinger-Reif, Alison A. and Fourches, Denis and others}, year={2019}, month={Feb} } @article{hu_lu_zhou_zhou_2019, title={MM ALGORITHMS FOR VARIANCE COMPONENT ESTIMATION AND SELECTION IN LOGISTIC LINEAR MIXED MODEL}, volume={29}, ISSN={["1996-8507"]}, DOI={10.5705/ss.202017.0220}, abstractNote={Logistic linear mixed models are widely used in experimental designs and genetic analyses of binary traits. Motivated by modern applications, we consider the case of many groups of random effects, where each group corresponds to a variance component. When the number of variance components is large, fitting a logistic linear mixed model is challenging. Thus, we develop two efficient and stable minorization-maximization (MM) algorithms for estimating variance components based on a Laplace approximation of the logistic model. One of these leads to a simple iterative soft-thresholding algorithm for variance component selection using the maximum penalized approximated likelihood. We demonstrate the variance component estimation and selection performance of our algorithms by means of simulation studies and an analysis of real data.}, number={3}, journal={STATISTICA SINICA}, author={Hu, Liuyi and Lu, Wenbin and Zhou, Jin and Zhou, Hua}, year={2019}, month={Jul}, pages={1585–1605} } @article{su_lu_song_2019, title={Modelling and estimation for optimal treatment decision with interference}, volume={8}, ISSN={2049-1573}, url={http://dx.doi.org/10.1002/STA4.219}, DOI={10.1002/STA4.219}, abstractNote={In many network‐based intervention studies, treatment applied on an individual or his or her own characteristics may also affect the outcome of other connected people. We call this interference along network. Approaches for deriving the optimal individualized treatment regimen remain unknown after introducing the effect of interference. 
In this paper, we propose a novel network‐based regression model that is able to account for interaction between outcomes and treatments in a network. Both Q‐learning and A‐learning methods are derived. We show that the optimal treatment regimen under our model is independent from interference, which makes its application in practice more feasible and appealing. The asymptotic properties of the proposed estimators are established. The performance of the proposed model and methods is illustrated by extensive simulation studies and an application to a mobile game network data.}, number={1}, journal={Stat}, publisher={Wiley}, author={Su, Lin and Lu, Wenbin and Song, Rui}, year={2019}, month={Jan} } @article{shi_song_lu_2019, title={ON TESTING CONDITIONAL QUALITATIVE TREATMENT EFFECTS}, volume={47}, ISSN={0090-5364}, DOI={10.1214/18-AOS1750}, abstractNote={Precision medicine is an emerging medical paradigm that focuses on finding the most effective treatment strategy tailored for individual patients. In the literature, most of the existing works focused on estimating the optimal treatment regime. However, there has been less attention devoted to hypothesis testing regarding the optimal treatment regime. In this paper, we first introduce the notion of conditional qualitative treatment effects (CQTE) of a set of variables given another set of variables and provide a class of equivalent representations for the null hypothesis of no CQTE. The proposed definition of CQTE does not assume any parametric form for the optimal treatment rule and plays an important role for assessing the incremental value of a set of new variables in optimal treatment decision making conditional on an existing set of prescriptive variables. We then propose novel testing procedures for no CQTE based on kernel estimation of the conditional contrast functions. We show that our test statistics have asymptotically correct size and non-negligible power against some nonstandard local alternatives. 
The empirical performance of the proposed tests are evaluated by simulations and an application to an AIDS data set.}, number={4}, journal={ANNALS OF STATISTICS}, author={Shi, Chengchun and Song, Rui and Lu, Wenbin}, year={2019}, month={Aug}, pages={2348–2377} } @article{zhou_zhang_lu_li_2021, title={On restricted optimal treatment regime estimation for competing risks data}, volume={22}, ISSN={["1468-4357"]}, DOI={10.1093/biostatistics/kxz026}, abstractNote={SUMMARY It is well accepted that individualized treatment regimes may improve the clinical outcomes of interest. However, positive treatment effects are often accompanied by certain side effects. Therefore, when choosing the optimal treatment regime for a patient, we need to consider both efficacy and safety issues. In this article, we propose to model time to a primary event of interest and time to severe side effects of treatment by a competing risks model and define a restricted optimal treatment regime based on cumulative incidence functions. The estimation approach is derived using a penalized value search method and investigated through extensive simulations. The proposed method is applied to an HIV dataset obtained from Health Sciences South Carolina, where we minimize the risk of treatment or virologic failures while controlling the risk of serious drug-induced side effects.}, number={2}, journal={BIOSTATISTICS}, author={Zhou, Jie and Zhang, Jiajia and Lu, Wenbin and Li, Xiaoming}, year={2021}, month={Apr}, pages={217–232} } @article{xiao_zhang_lu_2019, title={Robust regression for optimal individualized treatment rules}, volume={38}, ISSN={0277-6715 1097-0258}, url={http://dx.doi.org/10.1002/SIM.8102}, DOI={10.1002/SIM.8102}, abstractNote={Because different patients may respond quite differently to the same drug or treatment, there is an increasing interest in discovering individualized treatment rules. 
In particular, there is an emerging need to find optimal individualized treatment rules, which would lead to the “best” clinical outcome. In this paper, we propose a new class of loss functions and estimators based on robust regression to estimate the optimal individualized treatment rules. Compared to existing estimation methods in the literature, the new estimators are novel and advantageous in the following aspects. First, they are robust against skewed, heterogeneous, heavy‐tailed errors or outliers in data. Second, they are robust against a misspecification of the baseline function. Third, under some general situations, the new estimator coupled with the pinball loss approximately maximizes the outcome's conditional quantile instead of the conditional mean, which leads to a more robust optimal individualized treatment rule than the traditional mean‐based estimators. Consistency and asymptotic normality of the proposed estimators are established. Their empirical performance is demonstrated via extensive simulation studies and an analysis of an AIDS data set.}, number={11}, journal={Statistics in Medicine}, publisher={Wiley}, author={Xiao, W. and Zhang, H. H. and Lu, W.}, year={2019}, month={Feb}, pages={2059–2073} } @article{su_lu_song_huang_2020, title={Testing and Estimation of Social Network Dependence With Time to Event Data}, volume={115}, ISSN={["1537-274X"]}, DOI={10.1080/01621459.2019.1617153}, abstractNote={Abstract Nowadays, events are spread rapidly along social networks. We are interested in whether people’s responses to an event are affected by their friends’ characteristics. For example, how soon will a person start playing a game given that his/her friends like it? Studying social network dependence is an emerging research area. In this work, we propose a novel latent spatial autocorrelation Cox model to study social network dependence with time-to-event data. 
The proposed model introduces a latent indicator to characterize whether a person’s survival time might be affected by his or her friends’ features. We first propose a score-type test for detecting the existence of social network dependence. If it exists, we further develop an EM-type algorithm to estimate the model parameters. The performance of the proposed test and estimators are illustrated by simulation studies and an application to a time-to-event dataset about playing a popular mobile game from one of the largest online social network platforms. Supplementary materials for this article, including a standardized description of the materials available for reproducing the work, are available as an online supplement.}, number={530}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Su, Lin and Lu, Wenbin and Song, Rui and Huang, Danyang}, year={2020}, month={Apr}, pages={570–582} } @article{szatkiewicz_marceau_yilmaz_bulik_crowley_mattheisen_sullivan_lu_maity_tzeng_et al._2019, title={VARIANCE COMPONENT TEST FOR CROSS-DISORDER PATHWAY ANALYSIS}, volume={29}, ISSN={["1873-7862"]}, DOI={10.1016/j.euroneuro.2018.08.252}, journal={EUROPEAN NEUROPSYCHOPHARMACOLOGY}, author={Szatkiewicz, Jin and Marceau, Rachel and Yilmaz, Zeynep and Bulik, Cynthia and Crowley, James and Mattheisen, Manuel and Sullivan, Patrick and Lu, Wenbin and Maity, Arnab and Tzeng, Jung-Ying and et al.}, year={2019}, pages={1204–1205} } @article{shi_lu_song_2018, title={A Massive Data Framework for M-Estimators with Cubic-Rate}, volume={113}, ISSN={0162-1459 1537-274X}, url={http://dx.doi.org/10.1080/01621459.2017.1360779}, DOI={10.1080/01621459.2017.1360779}, abstractNote={ABSTRACT The divide and conquer method is a common strategy for handling massive data. In this article, we study the divide and conquer method for cubic-rate estimators under the massive data framework. 
We develop a general theory for establishing the asymptotic distribution of the aggregated M-estimators using a weighted average with weights depending on the subgroup sample sizes. Under certain condition on the growing rate of the number of subgroups, the resulting aggregated estimators are shown to have faster convergence rate and asymptotic normal distribution, which are more tractable in both computation and inference than the original M-estimators based on pooled data. Our theory applies to a wide class of M-estimators with cube root convergence rate, including the location estimator, maximum score estimator, and value search estimator. Empirical performance via simulations and a real data application also validate our theoretical findings. Supplementary materials for this article are available online.}, number={524}, journal={Journal of the American Statistical Association}, publisher={Informa UK Limited}, author={Shi, Chengchun and Lu, Wenbin and Song, Rui}, year={2018}, month={Jun}, pages={1698–1709} } @article{zhou_zhang_lu_2018, title={Computationally Efficient Estimation for the Generalized Odds Rate Mixture Cure Model With Interval-Censored Data}, volume={27}, ISSN={["1537-2715"]}, DOI={10.1080/10618600.2017.1349665}, abstractNote={ABSTRACT For semiparametric survival models with interval-censored data and a cure fraction, it is often difficult to derive nonparametric maximum likelihood estimation due to the challenge in maximizing the complex likelihood function. In this article, we propose a computationally efficient EM algorithm, facilitated by a gamma-Poisson data augmentation, for maximum likelihood estimation in a class of generalized odds rate mixture cure (GORMC) models with interval-censored data. The gamma-Poisson data augmentation greatly simplifies the EM estimation and enhances the convergence speed of the EM algorithm. 
The empirical properties of the proposed method are examined through extensive simulation studies and compared with numerical maximum likelihood estimates. An R package “GORCure” is developed to implement the proposed method and its use is illustrated by an application to the Aerobic Center Longitudinal Study dataset. Supplementary material for this article is available online.}, number={1}, journal={JOURNAL OF COMPUTATIONAL AND GRAPHICAL STATISTICS}, author={Zhou, Jie and Zhang, Jiajia and Lu, Wenbin}, year={2018}, pages={48–58} } @article{liang_lu_song_2018, title={Deep advantage learning for optimal dynamic treatment regime}, volume={2}, ISSN={2475-4269 2475-4277}, url={http://dx.doi.org/10.1080/24754269.2018.1466096}, DOI={10.1080/24754269.2018.1466096}, abstractNote={ABSTRACT Recently deep learning has successfully achieved state-of-the-art performance on many difficult tasks. Deep neural networks allow for model flexibility and process features without the need of domain knowledge. Advantage learning (A-learning) is a popular method in dynamic treatment regime (DTR). It models the advantage function, which is of direct relevance to optimal treatment decision. No assumptions on baseline function are made. However, there is a paucity of literature on deep A-learning. In this paper, we present a deep A-learning approach to estimate optimal DTR. We use an inverse probability weighting method to estimate the difference between potential outcomes. Parameter sharing of convolutional neural networks (CNN) greatly reduces the amount of parameters in neural networks, which allows for high scalability. Convexified convolutional neural networks (CCNN) relax the constraints of CNN for optimisation purpose. Different architectures of CNN and CCNN are implemented for contrast function estimation. 
Both simulation results and application to the STAR*D (Sequenced Treatment Alternatives to Relieve Depression) trial indicate that the proposed methods outperform penalised least square estimator.}, number={1}, journal={Statistical Theory and Related Fields}, publisher={Informa UK Limited}, author={Liang, Shuhan and Lu, Wenbin and Song, Rui}, year={2018}, month={Jan}, pages={80–88} } @article{shi_fan_song_lu_2018, title={HIGH-DIMENSIONAL A-LEARNING FOR OPTIMAL DYNAMIC TREATMENT REGIMES}, volume={46}, ISSN={["0090-5364"]}, DOI={10.1214/17-aos1570}, abstractNote={Precision medicine is a medical paradigm that focuses on finding the most effective treatment decision based on individual patient information. For many complex diseases, such as cancer, treatment decisions need to be tailored over time according to patients' responses to previous treatments. Such an adaptive strategy is referred to as a dynamic treatment regime. A major challenge in deriving an optimal dynamic treatment regime arises when an extraordinary large number of prognostic factors, such as patient's genetic information, demographic characteristics, medical history and clinical measurements over time are available, but not all of them are necessary for making treatment decision. This makes variable selection an emerging need in precision medicine. In this paper, we propose a penalized multi-stage A-learning for deriving the optimal dynamic treatment regime when the number of covariates is of the non-polynomial (NP) order of the sample size. To preserve the double robustness property of the A-learning method, we adopt the Dantzig selector which directly penalizes the A-learning estimating equations. Oracle inequalities of the proposed estimators for the parameters in the optimal dynamic treatment regime and error bounds on the difference between the value functions of the estimated optimal dynamic treatment regime and the true optimal dynamic treatment regime are established. 
Empirical performance of the proposed approach is evaluated by simulations and illustrated with an application to data from the STAR*D study.}, number={3}, journal={ANNALS OF STATISTICS}, author={Shi, Chengchun and Fan, Ailin and Song, Rui and Lu, Wenbin}, year={2018}, month={Jun}, pages={925–957} } @article{jeng_lu_peng_2018, title={High-dimensional inference for personalized treatment decision}, volume={12}, ISSN={["1935-7524"]}, url={https://europepmc.org/articles/PMC6226259}, DOI={10.1214/18-ejs1439}, abstractNote={Recent development in statistical methodology for personalized treatment decision has utilized high-dimensional regression to take into account a large number of patients' covariates and described personalized treatment decision through interactions between treatment and covariates. While a subset of interaction terms can be obtained by existing variable selection methods to indicate relevant covariates for making treatment decision, there often lacks statistical interpretation of the results. This paper proposes an asymptotically unbiased estimator based on Lasso solution for the interaction coefficients. We derive the limiting distribution of the estimator when baseline function of the regression model is unknown and possibly misspecified. Confidence intervals and p-values are derived to infer the effects of the patients' covariates in making treatment decision. We confirm the accuracy of the proposed method and its robustness against misspecified function in simulation and apply the method to STAR*D study for major depression disorder.}, number={1}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Jeng, X. 
Jessie and Lu, Wenbin and Peng, Huimin}, year={2018}, pages={2074–2089} } @article{shi_song_lu_fu_2018, title={Maximin projection learning for optimal treatment decision with heterogeneous individualized treatment effects}, volume={80}, ISSN={["1467-9868"]}, DOI={10.1111/rssb.12273}, abstractNote={Summary A salient feature of data from clinical trials and medical studies is inhomogeneity. Patients not only differ in baseline characteristics, but also in the way that they respond to treatment. Optimal individualized treatment regimes are developed to select effective treatments based on patient's heterogeneity. However, the optimal treatment regime might also vary for patients across different subgroups. We mainly consider patients’ heterogeneity caused by groupwise individualized treatment effects assuming the same marginal treatment effects for all groups. We propose a new maximin projection learning method for estimating a single treatment decision rule that works reliably for a group of future patients from a possibly new subpopulation. Based on estimated optimal treatment regimes for all subgroups, the proposed maximin treatment regime is obtained by solving a quadratically constrained linear programming problem, which can be efficiently computed by interior point methods. Consistency and asymptotic normality of the estimator are established. Numerical examples show the reliability of the methodology proposed.}, number={4}, journal={JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES B-STATISTICAL METHODOLOGY}, author={Shi, Chengchun and Song, Rui and Lu, Wenbin and Fu, Bo}, year={2018}, month={Sep}, pages={681–702} } @article{kang_lu_zhang_2018, title={ON ESTIMATION OF THE OPTIMAL TREATMENT REGIME WITH THE ADDITIVE HAZARDS MODEL}, volume={28}, ISSN={["1996-8507"]}, DOI={10.5705/ss.202016.0543}, abstractNote={We propose a doubly robust estimation method for the optimal treatment regime based on an additive hazards model with censored survival data. 
Specifically, we introduce a new semiparametric additive hazard model which allows flexible baseline covariate effects in the control group and incorporates marginal treatment effect and its linear interaction with covariates. In addition, we propose a time-dependent propensity score to construct an A-learning type of estimating equations. The resulting estimator is shown to be consistent and asymptotically normal when either the baseline effect model for covariates or the propensity score is correctly specified. The asymptotic variance of the estimator is consistently estimated using a simple resampling method. Simulation studies are conducted to evaluate the finite-sample performance of the estimators and an application to AIDS clinical trial data is also given to illustrate the methodology.}, number={3}, journal={STATISTICA SINICA}, author={Kang, Suhyun and Lu, Wenbin and Zhang, Jiajia}, year={2018}, month={Jul}, pages={1539–1560} } @article{liang_lu_song_wang_2018, title={Sparse concordance-assisted learning for optimal treatment decision}, volume={18}, journal={Journal of Machine Learning Research}, author={Liang, S. H. and Lu, W. B. and Song, R. and Wang, L.}, year={2018} } @article{chen_lu_zhao_2018, title={An improved survival estimator for censored medical costs with a kernel approach}, volume={47}, ISSN={["1532-415X"]}, DOI={10.1080/03610926.2017.1400059}, abstractNote={ABSTRACT Cost assessment serves as an essential part in economic evaluation of medical interventions. In many studies, costs as well as survival data are frequently censored. Standard survival analysis techniques are often invalid for censored costs, due to the induced dependent censoring problem. Owing to high skewness in many cost data, it is desirable to estimate the median costs, which will be available with estimated survival function of costs. We propose a kernel-based survival estimator for costs, which is monotone, consistent, and more efficient than several existing estimators. 
We conduct numerical studies to examine the finite-sample performance of the proposed estimator.}, number={23}, journal={COMMUNICATIONS IN STATISTICS-THEORY AND METHODS}, author={Chen, Shuai and Lu, Wenbin and Zhao, Hongwei}, year={2018}, pages={5702–5716} } @article{fan_song_lu_2017, title={Change-Plane Analysis for Subgroup Detection and Sample Size Calculation}, volume={112}, ISSN={0162-1459 1537-274X}, url={http://dx.doi.org/10.1080/01621459.2016.1166115}, DOI={10.1080/01621459.2016.1166115}, abstractNote={ABSTRACT We propose a systematic method for testing and identifying a subgroup with an enhanced treatment effect. We adopt a change-plane technique to first test the existence of a subgroup, and then identify the subgroup if the null hypothesis on nonexistence of such a subgroup is rejected. A semiparametric model is considered for the response with an unspecified baseline function and an interaction between a subgroup indicator and treatment. A doubly robust test statistic is constructed based on this model, and asymptotic distributions of the test statistic under both null and local alternative hypotheses are derived. Moreover, a sample size calculation method for subgroup detection is developed based on the proposed statistic. The finite sample performance of the proposed test is evaluated via simulations. 
Finally, the proposed methods for subgroup identification and sample size calculation are applied to a data from an AIDS study.}, number={518}, journal={Journal of the American Statistical Association}, publisher={Informa UK Limited}, author={Fan, Ailin and Song, Rui and Lu, Wenbin}, year={2017}, month={Apr}, pages={769–778} } @article{jiang_lu_song_hudgens_naprvavnik_2017, title={DOUBLY ROBUST ESTIMATION OF OPTIMAL TREATMENT REGIMES FOR SURVIVAL DATA-WITH APPLICATION TO AN HIV/AIDS STUDY}, volume={11}, ISSN={["1932-6157"]}, DOI={10.1214/17-aoas1057}, abstractNote={In many biomedical settings, assigning every patient the same treatment may not be optimal due to patient heterogeneity. Individualized treatment regimes have the potential to dramatically improve clinical outcomes. When the primary outcome is censored survival time, a main interest is to find optimal treatment regimes that maximize the survival probability of patients. Since the survival curve is a function of time, it is important to balance short-term and long-term benefit when assigning treatments. In this paper, we propose a doubly robust approach to estimate optimal treatment regimes that optimize a user specified function of the survival curve, including the restricted mean survival time and the median survival time. The empirical and asymptotic properties of the proposed method are investigated. The proposed method is applied to a data set from an ongoing HIV/AIDS clinical observational study conducted by the University of North Carolina (UNC) Center of AIDS Research (CFAR), and shows the proposed methods significantly improve the restricted mean time of the initial treatment duration. Finally, the proposed methods are extended to multi-stage studies.}, number={3}, journal={ANNALS OF APPLIED STATISTICS}, author={Jiang, Runchao and Lu, Wenbin and Song, Rui and Hudgens, Michael G. 
and Napravnik, Sonia}, year={2017}, month={Sep}, pages={1763–1786} } @article{song_luo_zeng_zhang_lu_li_2017, title={Semiparametric single-index model for estimating optimal individualized treatment strategy}, volume={11}, ISSN={["1935-7524"]}, DOI={10.1214/17-ejs1226}, abstractNote={Different from the standard treatment discovery framework which is used for finding single treatments for a homogenous group of patients, personalized medicine involves finding therapies that are tailored to each individual in a heterogeneous group. In this paper, we propose a new semiparametric additive single-index model for estimating individualized treatment strategy. The model assumes a flexible and nonparametric link function for the interaction between treatment and predictive covariates. We estimate the rule via monotone B-splines and establish the asymptotic properties of the estimators. Both simulations and a real data application demonstrate that the proposed method has a competitive performance.}, number={1}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Song, Rui and Luo, Shikai and Zeng, Donglin and Zhang, Hao Helen and Lu, Wenbin and Li, Zhiguo}, year={2017}, pages={364–384} } @article{kang_lu_song_2017, title={Subgroup detection and sample size calculation with proportional hazards regression for survival data}, volume={36}, ISSN={0277-6715}, url={http://dx.doi.org/10.1002/sim.7441}, DOI={10.1002/sim.7441}, abstractNote={In this paper, we propose a testing procedure for detecting and estimating the subgroup with an enhanced treatment effect in survival data analysis. Here, we consider a new proportional hazard model that includes a nonparametric component for the covariate effect in the control group and a subgroup‐treatment–interaction effect defined by a change plane. 
We develop a score‐type test for detecting the existence of the subgroup, which is doubly robust against misspecification of the baseline effect model or the propensity score but not both under mild assumptions for censoring. When the null hypothesis of no subgroup is rejected, the change‐plane parameters that define the subgroup can be estimated on the basis of supremum of the normalized score statistic. The asymptotic distributions of the proposed test statistic under the null and local alternative hypotheses are established. On the basis of established asymptotic distributions, we further propose a sample size calculation formula for detecting a given subgroup effect and derive a numerical algorithm for implementing the sample size calculation in clinical trial designs. The performance of the proposed approach is evaluated by simulation studies. An application to an AIDS clinical trial data is also given for illustration.}, number={29}, journal={Statistics in Medicine}, publisher={Wiley}, author={Kang, Suhyun and Lu, Wenbin and Song, Rui}, year={2017}, month={Aug}, pages={4646–4659} } @article{zhou_zhang_lu_2017, title={An Expectation Maximization algorithm for fitting the generalized odds-rate model to interval censored data}, volume={36}, ISSN={["1097-0258"]}, DOI={10.1002/sim.7204}, abstractNote={The generalized odds‐rate model is a class of semiparametric regression models, which includes the proportional hazards and proportional odds models as special cases. There are few works on estimation of the generalized odds‐rate model with interval censored data because of the challenges in maximizing the complex likelihood function. In this paper, we propose a gamma‐Poisson data augmentation approach to develop an Expectation Maximization algorithm, which can be used to fit the generalized odds‐rate model to interval censored data. The proposed Expectation Maximization algorithm is easy to implement and is computationally efficient. 
The performance of the proposed method is evaluated by comprehensive simulation studies and illustrated through applications to datasets from breast cancer and hemophilia studies. In order to make the proposed method easy to use in practice, an R package ‘ICGOR’ was developed. Copyright © 2016 John Wiley & Sons, Ltd.}, number={7}, journal={STATISTICS IN MEDICINE}, author={Zhou, Jie and Zhang, Jiajia and Lu, Wenbin}, year={2017}, month={Mar}, pages={1157–1171} } @article{fan_lu_song_zhou_2016, title={Concordance-assisted learning for estimating optimal individualized treatment regimes}, volume={79}, ISSN={1369-7412}, url={http://dx.doi.org/10.1111/rssb.12216}, DOI={10.1111/rssb.12216}, abstractNote={Summary We propose new concordance-assisted learning for estimating optimal individualized treatment regimes. We first introduce a type of concordance function for prescribing treatment and propose a robust rank regression method for estimating the concordance function. We then find treatment regimes, up to a threshold, to maximize the concordance function, named the prescriptive index. Finally, within the class of treatment regimes that maximize the concordance function, we find the optimal threshold to maximize the value function. We establish the rate of convergence and asymptotic normality of the proposed estimator for parameters in the prescriptive index. An induced smoothing method is developed to estimate the asymptotic variance of the estimator. We also establish the $n^{1/3}$-consistency of the estimated optimal threshold and its limiting distribution. In addition, a doubly robust estimator of parameters in the prescriptive index is developed under a class of monotonic index models. 
The practical use and effectiveness of the methodology proposed are demonstrated by simulation studies and an application to an acquired immune deficiency syndrome data set.}, number={5}, journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, publisher={Wiley}, author={Fan, Caiyun and Lu, Wenbin and Song, Rui and Zhou, Yong}, year={2016}, month={Oct}, pages={1565–1582} } @article{kang_lu_liu_2017, title={Efficient Estimation for Accelerated Failure Time Model under Case-Cohort and Nested Case-Control Sampling}, volume={73}, ISSN={["1541-0420"]}, DOI={10.1111/biom.12573}, abstractNote={Summary Case-cohort (Prentice, 1986) and nested case-control (Thomas, 1977) designs have been widely used as a cost-effective alternative to the full-cohort design. In this article, we propose an efficient likelihood-based estimation method for the accelerated failure time model under case-cohort and nested case-control designs. An EM algorithm is developed to maximize the likelihood function and a kernel smoothing technique is adopted to facilitate the estimation in the M-step of the EM algorithm. We show that the proposed estimators for the regression coefficients are consistent and asymptotically normal. The asymptotic variance of the estimators can be consistently estimated using an EM-aided numerical differentiation method. 
Simulation studies are conducted to evaluate the finite-sample performance of the estimators and an application to a Wilms tumor data set is also given to illustrate the methodology.}, number={1}, journal={BIOMETRICS}, author={Kang, Suhyun and Lu, Wenbin and Liu, Mengling}, year={2017}, month={Mar}, pages={114–123} } @article{jiang_lu_song_davidian_2016, title={On estimation of optimal treatment regimes for maximizing t -year survival probability}, volume={79}, ISSN={1369-7412}, url={http://dx.doi.org/10.1111/rssb.12201}, DOI={10.1111/rssb.12201}, abstractNote={Summary A treatment regime is a deterministic function that dictates personalized treatment based on patients’ individual prognostic information. There is increasing interest in finding optimal treatment regimes, which determine treatment at one or more treatment decision points to maximize expected long-term clinical outcomes, where larger outcomes are preferred. For chronic diseases such as cancer or human immunodeficiency virus infection, survival time is often the outcome of interest, and the goal is to select treatment to maximize survival probability. We propose two non-parametric estimators for the survival function of patients following a given treatment regime involving one or more decisions, i.e. the so-called value. On the basis of data from a clinical or observational study, we estimate an optimal regime by maximizing these estimators for the value over a prespecified class of regimes. Because the value function is very jagged, we introduce kernel smoothing within the estimator to improve performance. Asymptotic properties of the proposed estimators of value functions are established under suitable regularity conditions, and simulation studies evaluate the finite sample performance of the regime estimators. 
The methods are illustrated by application to data from an acquired immune deficiency syndrome clinical trial.}, number={4}, journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, publisher={Wiley}, author={Jiang, Runchao and Lu, Wenbin and Song, Rui and Davidian, Marie}, year={2016}, month={Sep}, pages={1165–1185} } @article{bai_tsiatis_lu_song_2017, title={Optimal treatment regimes for survival endpoints using locally-efficient doubly-robust estimator from a classification perspective}, volume={23}, ISSN={["1572-9249"]}, DOI={10.1007/s10985-016-9376-x}, abstractNote={A treatment regime at a single decision point is a rule that assigns a treatment, among the available options, to a patient based on the patient's baseline characteristics. The value of a treatment regime is the average outcome of a population of patients if they were all treated in accordance to the treatment regime, where large values are desirable. The optimal treatment regime is a regime which results in the greatest value. Typically, the optimal treatment regime is estimated by positing a regression relationship for the outcome of interest as a function of treatment and baseline characteristics. However, this can lead to suboptimal treatment regimes when the regression model is misspecified. We instead consider value search estimators for the optimal treatment regime where we directly estimate the value for any treatment regime and then maximize this estimator over a class of regimes. For many studies the primary outcome of interest is survival time which is often censored. We derive a locally efficient, doubly robust, augmented inverse probability weighted complete case estimator for the value function with censored survival data and study the large sample properties of this estimator. The optimization is realized from a weighted classification perspective that allows us to use available off the shelf software. 
In some studies one treatment may have greater toxicity or side effects, thus we also consider estimating a quality adjusted optimal treatment regime that allows a patient to trade some additional risk of death in order to avoid the more invasive treatment.}, number={4}, journal={Lifetime Data Analysis}, author={Bai, X. and Tsiatis, A. and Lu, W. and Song, R.}, year={2017}, pages={585–604} } @article{goldberg_lu_fine_2016, title={Oracle estimation of parametric transformation models}, volume={10}, ISSN={["1935-7524"]}, DOI={10.1214/15-ejs1083}, abstractNote={Transformation models, like the Box-Cox transformation, are widely used in regression to reduce non-additivity, non-normality, and heteroscedasticity. The question of whether one may or may not treat the estimated transformation parameter as fixed in inference about other model parameters has a long and controversial history (Bickel and Doksum, 1981, Hinkley and Runger, 1984). While the frequentist wisdom is that uncertainty regarding the true value of the transformation parameter cannot be ignored, in practice, difficulties in interpretation arise if the transformation is regarded as random and not fixed. In this paper, we suggest a golden mean methodology which attempts to reconcile these philosophies. Penalized estimation yields oracle estimates of transformations that enable treating the transformation parameter as known when the data indicate one of a prespecified set of transformations of scientific interest. When the true transformation is outside this set, rigorous frequentist inference is still achieved. The methodology permits multiple candidate values for the transformation, as is common in applications, as well as simultaneously accommodating variable selection in regression model. Theoretical issues, such as selection consistency and the oracle property, are rigorously established. 
Numerical studies, including extensive simulation studies and real data examples, illustrate the practical utility of the proposed methods.}, number={1}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Goldberg, Yair and Lu, Wenbin and Fine, Jason}, year={2016}, pages={90–120} } @article{jeng_daye_lu_tzeng_2016, title={Rare variants association analysis in large-scale sequencing studies at the single locus level}, volume={12}, number={6}, journal={PLoS Computational Biology}, author={Jeng, X. J. and Daye, Z. J. and Lu, W. B. and Tzeng, J. Y.}, year={2016} } @article{shi_song_lu_2016, title={Robust learning for optimal treatment decision with NP-dimensionality}, volume={10}, ISSN={["1935-7524"]}, DOI={10.1214/16-ejs1178}, abstractNote={In order to identify important variables that are involved in making optimal treatment decision, Lu, Zhang and Zeng (2013) proposed a penalized least squared regression framework for a fixed number of predictors, which is robust against the misspecification of the conditional mean model. Two problems arise: (i) in a world of explosively big data, effective methods are needed to handle ultra-high dimensional data set, for example, with the dimension of predictors is of the non-polynomial (NP) order of the sample size; (ii) both the propensity score and conditional mean models need to be estimated from data under NP dimensionality. In this paper, we propose a robust procedure for estimating the optimal treatment regime under NP dimensionality. In both steps, penalized regressions are employed with the non-concave penalty function, where the conditional mean model of the response given predictors may be misspecified. The asymptotic properties, such as weak oracle properties, selection consistency and oracle distributions, of the proposed estimators are investigated. In addition, we study the limiting distribution of the estimated value function for the obtained optimal treatment regime. 
The empirical performance of the proposed estimation method is evaluated by simulations and an application to a depression dataset from the STAR*D study.}, number={2}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Shi, Chengchun and Song, Rui and Lu, Wenbin}, year={2016}, pages={2894–2921} } @article{fan_lu_song_2016, title={SEQUENTIAL ADVANTAGE SELECTION FOR OPTIMAL TREATMENT REGIME}, volume={10}, ISSN={["1932-6157"]}, DOI={10.1214/15-aoas849}, abstractNote={Variable selection for optimal treatment regime in a clinical trial or an observational study is getting more attention. Most existing variable selection techniques focused on selecting variables that are important for prediction, therefore some variables that are poor in prediction but are critical for decision-making may be ignored. A qualitative interaction of a variable with treatment arises when treatment effect changes direction as the value of this variable varies. The qualitative interaction indicates the importance of this variable for decision-making. Gunter, Zhu and Murphy (2011) proposed S-score which characterizes the magnitude of qualitative interaction of each variable with treatment individually. In this article, we developed a sequential advantage selection method based on the modified S-score. Our method selects qualitatively interacted variables sequentially, and hence excludes marginally important but jointly unimportant variables or vice versa. The optimal treatment regime based on variables selected via joint model is more comprehensive and reliable. With the proposed stopping criteria, our method can handle a large amount of covariates even if sample size is small. Simulation results show our method performs well in practical settings. 
We further applied our method to data from a clinical trial for depression.}, number={1}, journal={ANNALS OF APPLIED STATISTICS}, author={Fan, Ailin and Lu, Wenbin and Song, Rui}, year={2016}, month={Mar}, pages={32–53} } @article{marceau_lu_holloway_sale_worrall_williams_hsu_tzeng_2015, title={A Fast Multiple-Kernel Method With Applications to Detect Gene-Environment Interaction}, volume={39}, ISSN={["1098-2272"]}, DOI={10.1002/gepi.21909}, abstractNote={Kernel machine (KM) models are a powerful tool for exploring associations between sets of genetic variants and complex traits. Although most KM methods use a single kernel function to assess the marginal effect of a variable set, KM analyses involving multiple kernels have become increasingly popular. Multikernel analysis allows researchers to study more complex problems, such as assessing gene‐gene or gene‐environment interactions, incorporating variance‐component based methods for population substructure into rare‐variant association testing, and assessing the conditional effects of a variable set adjusting for other variable sets. The KM framework is robust, powerful, and provides efficient dimension reduction for multifactor analyses, but requires the estimation of high dimensional nuisance parameters. Traditional estimation techniques, including regularization and the “expectation‐maximization (EM)” algorithm, have a large computational cost and are not scalable to large sample sizes needed for rare variant analysis. Therefore, under the context of gene‐environment interaction, we propose a computationally efficient and statistically rigorous “fastKM” algorithm for multikernel analysis that is based on a low‐rank approximation to the nuisance effect kernel matrices. Our algorithm is applicable to various trait types (e.g., continuous, binary, and survival traits) and can be implemented using any existing single‐kernel analysis software. 
Through extensive simulation studies, we show that our algorithm has similar performance to an EM‐based KM approach for quantitative traits while running much faster. We also apply our method to the Vitamin Intervention for Stroke Prevention (VISP) clinical trial, examining gene‐by‐vitamin effects on recurrent stroke risk and gene‐by‐age effects on change in homocysteine level.}, number={6}, journal={GENETIC EPIDEMIOLOGY}, author={Marceau, Rachel and Lu, Wenbin and Holloway, Shannon and Sale, Michele M. and Worrall, Bradford B. and Williams, Stephen R. and Hsu, Fang-Chi and Tzeng, Jung-Ying}, year={2015}, month={Sep}, pages={456–468} } @article{wang_zhang_lu_2015, title={Authors' Reply to comments on ‘Sample size calculation for the proportional hazards cure model’}, volume={34}, ISSN={0277-6715}, url={http://dx.doi.org/10.1002/SIM.6491}, DOI={10.1002/SIM.6491}, number={17}, journal={Statistics in Medicine}, publisher={Wiley}, author={Wang, Songfeng and Zhang, Jiajia and Lu, Wenbin}, year={2015}, month={Jul}, pages={2578–2580} }
@article{guo_li_lu_li_2015, title={Groupwise Dimension Reduction via Envelope Method}, volume={110}, ISSN={0162-1459 1537-274X}, url={http://dx.doi.org/10.1080/01621459.2014.970687}, DOI={10.1080/01621459.2014.970687}, abstractNote={The family of sufficient dimension reduction (SDR) methods that produce informative combinations of predictors, or indices, are particularly useful for high-dimensional regression analysis. In many such analyses, it becomes increasingly common that there is available a priori subject knowledge of the predictors; for example, they belong to different groups. While many recent SDR proposals have greatly expanded the scope of the methods’ applicability, how to effectively incorporate the prior predictor structure information remains a challenge. In this article, we aim at dimension reduction that recovers full regression information while preserving the predictor group structure. Built upon a new concept of the direct sum envelope, we introduce a systematic way to incorporate the group information in most existing SDR estimators. As a result, the reduction outcomes are much easier to interpret. 
Moreover, the envelope method provides a principled way to build a variety of prior structures into dimension reduction analysis. Both simulations and real data analysis demonstrate the competent numerical performance of the new method.}, number={512}, journal={Journal of the American Statistical Association}, publisher={Informa UK Limited}, author={Guo, Zifang and Li, Lexin and Lu, Wenbin and Li, Bing}, year={2015}, month={Oct}, pages={1515–1527} } @article{cheng_lu_liu_2015, title={Identification of homogeneous and heterogeneous variables in pooled cohort studies}, volume={71}, ISSN={["1541-0420"]}, DOI={10.1111/biom.12285}, abstractNote={Summary Pooled analyses integrate data from multiple studies and achieve a larger sample size for enhanced statistical power. When heterogeneity exists in variables’ effects on the outcome across studies, the simple pooling strategy fails to present a fair and complete picture of the effects of heterogeneous variables. Thus, it is important to investigate the homogeneous and heterogeneous structure of variables in pooled studies. In this article, we consider the pooled cohort studies with time-to-event outcomes and propose a penalized Cox partial likelihood approach with adaptively weighted composite penalties on variables’ homogeneous and heterogeneous effects. We show that our method can characterize the variables as having heterogeneous, homogeneous, or null effects, and estimate non-zero effects. The results are readily extended to high-dimensional applications where the number of parameters is larger than the sample size. The proposed selection and estimation procedure can be implemented using the iterative shooting algorithm. 
We conduct extensive numerical studies to evaluate the performance of our proposed method and demonstrate it using a pooled analysis of gene expression in patients with ovarian cancer.}, number={2}, journal={BIOMETRICS}, author={Cheng, Xin and Lu, Wenbin and Liu, Mengling}, year={2015}, month={Jun}, pages={397–403} } @article{pang_lu_wang_2015, title={Local {Buckley-James} estimation for heteroscedastic accelerated failure time model}, volume={25}, number={3}, journal={Statistica Sinica}, author={Pang, Lei and Lu, Wenbin and Wang, Huixia Judy}, year={2015}, pages={863–877} } @article{geng_lu_zhang_2014, title={A model-free machine learning method for risk classification and survival probability prediction}, volume={3}, ISSN={2049-1573}, url={http://dx.doi.org/10.1002/STA4.67}, DOI={10.1002/STA4.67}, abstractNote={Risk classification and survival probability prediction are two major goals in survival data analysis because they play an important role in patients' risk stratification, long‐term diagnosis, and treatment selection. In this article, we propose a new model‐free machine learning framework for risk classification and survival probability prediction based on weighted support vector machines. The new procedure does not require any specific parametric or semiparametric model assumption on data and is therefore capable of capturing non‐linear covariate effects. We use numerous simulation examples to demonstrate finite sample performance of the proposed method under various settings. Applications to a glioma tumour data and a breast cancer gene‐expression survival data are shown to illustrate the new methodology in real data analysis. 
Copyright © 2014 John Wiley & Sons, Ltd.}, number={1}, journal={Stat}, publisher={Wiley}, author={Geng, Yuan and Lu, Wenbin and Zhang, Hao Helen}, year={2014}, month={Mar}, pages={337–350} } @article{liu_lu_zhang_2014, title={Accelerated Intensity Frailty Model for Recurrent Events Data}, volume={70}, ISSN={["1541-0420"]}, DOI={10.1111/biom.12163}, abstractNote={Summary In this article we propose an accelerated intensity frailty (AIF) model for recurrent events data and derive a test for the variance of frailty. In addition, we develop a kernel‐smoothing‐based EM algorithm for estimating regression coefficients and the baseline intensity function. The variance of the resulting estimator for regression parameters is obtained by a numerical differentiation method. Simulation studies are conducted to evaluate the finite sample performance of the proposed estimator under practical settings and demonstrate the efficiency gain over the Gehan rank estimator based on the AFT model for counting process (Lin et al., 1998). Our method is further illustrated with an application to a bladder tumor recurrence data.}, number={3}, journal={BIOMETRICS}, author={Liu, Bo and Lu, Wenbin and Zhang, Jiajia}, year={2014}, month={Sep}, pages={579–587} } @article{song_lu_ma_jeng_2014, title={Censored rank independence screening for high-dimensional survival data}, volume={101}, ISSN={0006-3444 1464-3510}, url={http://dx.doi.org/10.1093/biomet/asu047}, DOI={10.1093/biomet/asu047}, abstractNote={In modern statistical applications, the dimension of covariates can be much larger than the sample size. In the context of linear models, correlation screening (Fan and Lv, 2008) has been shown to reduce the dimension of such data effectively while achieving the sure screening property, i.e., all of the active variables can be retained with high probability. However, screening based on the Pearson correlation does not perform well when applied to contaminated covariates and/or censored outcomes. 
In this paper, we study censored rank independence screening of high-dimensional survival data. The proposed method is robust to predictors that contain outliers, works for a general class of survival models, and enjoys the sure screening property. Simulations and an analysis of real data demonstrate that the proposed method performs competitively on survival data sets of moderate size and high-dimensional predictors, even when these are contaminated.}, number={4}, journal={Biometrika}, publisher={Oxford University Press (OUP)}, author={Song, R. and Lu, W. and Ma, S. and Jeng, X. (Jessie)}, year={2014}, month={Oct}, pages={799–814} } @article{guo_lu_li_2015, title={Forward Stagewise Shrinkage and Addition for High Dimensional Censored Regression}, volume={7}, ISSN={["1867-1772"]}, DOI={10.1007/s12561-014-9114-4}, abstractNote={Despite enormous development on variable selection approaches in recent years, modeling and selection of high dimensional censored regression remains a challenging question. When the number of predictors p far exceeds the number of observational units n and the outcome is censored, computations of existing solutions often become difficult, or even infeasible in some situations, while performances frequently deteriorate. In this article, we aim at simultaneous model estimation and variable selection for Cox proportional hazards models with high dimensional covariates. We propose a forward stage-wise shrinkage and addition approach for that purpose. Our proposal extends a popular statistical learning technique, the boosting method. It inherits the flexible nature of boosting and is straightforward to extend to nonlinear Cox models. Meanwhile it advances the classical boosting method by adding explicit variable selection and substantially reducing the number of iterations to the algorithm convergence. Our intensive simulations have showed that the new method enjoys a competitive performance in Cox models with both p < n and p ≥ n scenarios. 
The new method was also illustrated with analysis of two real microarray survival datasets.}, number={2}, journal={STATISTICS IN BIOSCIENCES}, author={Guo, Zifang and Lu, Wenbin and Li, Lexin}, year={2015}, month={Oct}, pages={225–244} } @article{tzeng_lu_hsu_2014, title={GENE-LEVEL PHARMACOGENETIC ANALYSIS ON SURVIVAL OUTCOMES USING GENE-TRAIT SIMILARITY REGRESSION}, volume={8}, ISSN={["1932-6157"]}, DOI={10.1214/14-aoas735}, abstractNote={Gene/pathway-based methods are drawing significant attention due to their usefulness in detecting rare and common variants that affect disease susceptibility. The biological mechanism of drug responses indicates that a gene-based analysis has even greater potential in pharmacogenetics. Motivated by a study from the Vitamin Intervention for Stroke Prevention (VISP) trial, we develop a gene-trait similarity regression for survival analysis to assess the effect of a gene or pathway on time-to-event outcomes. The similarity regression has a general framework that covers a range of survival models, such as the proportional hazards model and the proportional odds model. The inference procedure developed under the proportional hazards model is robust against model misspecification. We derive the equivalence between the similarity survival regression and a random effects model, which further unifies the current variance-component based methods. We demonstrate the effectiveness of the proposed method through simulation studies. In addition, we apply the method to the VISP trial data to identify the genes that exhibit an association with the risk of a recurrent stroke. TCN2 gene was found to be associated with the recurrent stroke risk in the low-dose arm. 
This gene may impact recurrent stroke risk in response to cofactor therapy.}, number={2}, journal={ANNALS OF APPLIED STATISTICS}, author={Tzeng, Jung-Ying and Lu, Wenbin and Hsu, Fang-Chi}, year={2014}, month={Jun}, pages={1232–1255} } @article{cai_wang_lu_zhang_2014, title={NPHMC: An R-package for estimating sample size of proportional hazards mixture cure model}, volume={113}, ISSN={["1872-7565"]}, DOI={10.1016/j.cmpb.2013.10.001}, abstractNote={Due to advances in medical research, more and more diseases can be cured nowadays, which largely increases the need for an easy-to-use software in calculating sample size of clinical trials with cure fractions. Current available sample size software, such as PROC POWER in SAS, Survival Analysis module in PASS, powerSurvEpi package in R are all based on the standard proportional hazards (PH) model which is not appropriate to design a clinical trial with cure fractions. Instead of the standard PH model, the PH mixture cure model is an important tool in handling the survival data with possible cure fractions. However, there are no tools available that can help design a trial with cure fractions. Therefore, we develop an R package NPHMC to determine the sample size needed for such study design.}, number={1}, journal={COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE}, author={Cai, Chao and Wang, Songfeng and Lu, Wenbin and Zhang, Jiajia}, year={2014}, month={Jan}, pages={290–300} } @article{geng_zhang_lu_2015, title={On optimal treatment regimes selection for mean survival time}, volume={34}, ISSN={["1097-0258"]}, DOI={10.1002/sim.6397}, abstractNote={In clinical studies with time‐to‐event as a primary endpoint, one main interest is to find the best treatment strategy to maximize patients' mean survival time. Due to patient's heterogeneity in response to treatments, great efforts have been devoted to developing optimal treatment regimes by integrating individuals' clinical and genetic information. 
A main challenge arises in the selection of important variables that can help to build reliable and interpretable optimal treatment regimes as the dimension of predictors may be high. In this paper, we propose a robust loss‐based estimation framework that can be easily coupled with shrinkage penalties for both estimation of optimal treatment regimes and variable selection. The asymptotic properties of the proposed estimators are studied. Moreover, a model‐free estimator of restricted mean survival time under the derived optimal treatment regime is developed, and its asymptotic property is studied. Simulations are conducted to assess the empirical performance of the proposed method for parameter estimation, variable selection, and optimal treatment decision. An application to an AIDS clinical trial data set is given to illustrate the method. Copyright © 2014 John Wiley & Sons, Ltd.}, number={7}, journal={STATISTICS IN MEDICINE}, author={Geng, Yuan and Zhang, Hao Helen and Lu, Wenbin}, year={2015}, month={Mar}, pages={1169–1184} } @article{wang_zhang_lu_2014, title={Sample size calculation for the proportional hazards model with a time-dependent covariate}, volume={74}, ISSN={["1872-7352"]}, DOI={10.1016/j.csda.2014.01.018}, abstractNote={The Cox proportional hazards (PH) model with time-dependent covariates (referred to as the extended PH model) has been widely used in medical and health related studies to investigate the effects of time-varying risk factors on survival. Theories and practices regarding model estimation and fitting have been well developed for the extended PH model. However, little has been done regarding sample size calculations in survival studies involving a time-varying risk factor. A novel sample size formula based on the extended PH model is proposed by investigating the asymptotic distributions of the weighted log-rank statistics under the null and local alternative hypotheses. 
The derived sample size formula is an extension of the sample size formula for the standard Cox PH model. The performance of the proposed formula is evaluated by extensive simulations, and examples based on real data are given to illustrate the applications of the proposed methods.}, journal={COMPUTATIONAL STATISTICS \& DATA ANALYSIS}, author={Wang, Songfeng and Zhang, Jiajia and Lu, Wenbin}, year={2014}, month={Jun}, pages={217–227} } @article{lu_liu_chen_2014, title={Testing Goodness-of-Fit for the Proportional Hazards Model based on Nested Case-Control Data}, volume={70}, ISSN={["1541-0420"]}, DOI={10.1111/biom.12239}, abstractNote={Summary Nested case–control sampling is a popular design for large epidemiological cohort studies due to its cost effectiveness. A number of methods have been developed for the estimation of the proportional hazards model with nested case–control data; however, the evaluation of modeling assumption is less attended. In this article, we propose a class of goodness‐of‐fit test statistics for testing the proportional hazards assumption based on nested case–control data. The test statistics are constructed based on asymptotically mean‐zero processes derived from Samuelsen's maximum pseudo‐likelihood estimation method. In addition, we develop an innovative resampling scheme to approximate the asymptotic distribution of the test statistics while accounting for the dependent sampling scheme of nested case–control design. 
Numerical studies are conducted to evaluate the performance of our proposed approach, and an application to the Wilms' Tumor Study is given to illustrate the methodology.}, number={4}, journal={BIOMETRICS}, author={Lu, Wenbin and Liu, Mengling and Chen, Yi-Hau}, year={2014}, month={Dec}, pages={845–851} } @article{kim_lu_sit_ying_2013, title={A Unified Approach to Semiparametric Transformation Models Under General Biased Sampling Schemes}, volume={108}, ISSN={["1537-274X"]}, DOI={10.1080/01621459.2012.746073}, abstractNote={We propose a unified estimation method for semiparametric linear transformation models under general biased sampling schemes. The new estimator is obtained from a set of counting process-based unbiased estimating equations, developed through introducing a general weighting scheme that offsets the sampling bias. The usual asymptotic properties, including consistency and asymptotic normality, are established under suitable regularity conditions. A closed-form formula is derived for the limiting variance and the plug-in estimator is shown to be consistent. We demonstrate the unified approach through the special cases of left truncation, length bias, the case-cohort design, and variants thereof. Simulation studies and applications to real datasets are presented.}, number={501}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Kim, Jane Paik and Lu, Wenbin and Sit, Tony and Ying, Zhiliang}, year={2013}, month={Mar}, pages={217–227} } @article{liu_lu_krogh_hallmans_clendenen_zeleniuch-jacquotte_2013, title={Estimation and selection of complex covariate effects in pooled nested case-control studies with heterogeneity}, volume={14}, ISSN={["1468-4357"]}, DOI={10.1093/biostatistics/kxt015}, abstractNote={A major challenge in cancer epidemiologic studies, especially those of rare cancers, is observing enough cases. 
To address this, researchers often join forces by bringing multiple studies together to achieve large sample sizes, allowing for increased power in hypothesis testing, and improved efficiency in effect estimation. Combining studies, however, renders the analysis difficult owing to the presence of heterogeneity in the pooled data. In this article, motivated by a collaborative nested case-control (NCC) study of ovarian cancer in three cohorts from United States, Sweden, and Italy, we investigate the use of penalty regularized partial likelihood estimation in the context of pooled NCC studies to achieve two goals. First, we propose an adaptive group lasso (gLASSO) penalized approach to simultaneously identify important variables and estimate their effects. Second, we propose a composite agLASSO penalized approach to identify variables with heterogeneous effects. Both methods are readily implemented with the group coordinate gradient decent algorithm and shown to enjoy the oracle property. We conduct simulation studies to evaluate the performance of our proposed approaches in finite samples under various heterogeneity settings, and apply them to the pooled ovarian cancer study.}, number={4}, journal={BIOSTATISTICS}, author={Liu, Mengling and Lu, Wenbin and Krogh, Vittorio and Hallmans, Goran and Clendenen, Tess V. and Zeleniuch-Jacquotte, Anne}, year={2013}, month={Sep}, pages={682–694} } @article{liu_lu_zhang_2013, title={Kernel smoothed profile likelihood estimation in the accelerated failure time frailty model for clustered survival data}, volume={100}, ISSN={["0006-3444"]}, DOI={10.1093/biomet/ast012}, abstractNote={Clustered survival data frequently arise in biomedical applications, where event times of interest are clustered into groups such as families. In this article we consider an accelerated failure time frailty model for clustered survival data and develop nonparametric maximum likelihood estimation for it via a kernel smoother aided EM algorithm. 
We show that the proposed estimator for the regression coefficients is consistent, asymptotically normal and semiparametric efficient when the kernel bandwidth is properly chosen. An EM-aided numerical differentiation method is derived for estimating its variance. Simulation studies evaluate the finite sample performance of the estimator, and it is applied to the Diabetic Retinopathy data set.}, number={3}, journal={BIOMETRIKA}, author={Liu, Bo and Lu, Wenbin and Zhang, Jiajia}, year={2013}, month={Sep}, pages={741–755} } @article{kim_cai_lu_2013, title={More efficient estimators for case-cohort studies}, volume={100}, ISSN={["0006-3444"]}, DOI={10.1093/biomet/ast018}, abstractNote={The case-cohort study design, used to reduce costs in large cohort studies, is a random sample of the entire cohort, named the subcohort, augmented with subjects having the disease of interest but not in the subcohort sample. When several diseases are of interest, several case-cohort studies may be conducted using the same subcohort, with each disease analyzed separately, ignoring the additional exposure measurements collected on subjects with the other diseases. This is not an efficient use of the data, and in this paper, we propose more efficient estimators. We consider both joint and separate analyses for the multiple diseases. We propose an estimating equation approach with a new weight function, and we establish the consistency and asymptotic normality of the resulting estimator. Simulation studies show that the proposed methods using all available information gain efficiency. We apply our proposed method to the data from the Busselton Health Study.}, number={3}, journal={BIOMETRIKA}, author={Kim, S. and Cai, J. 
and Lu, W.}, year={2013}, month={Sep}, pages={695–708} } @article{shang_liu_zeleniuch-jacquotte_clendenen_krogh_hallmans_lu_2013, title={Partially linear single index Cox regression model in nested case-control studies}, volume={67}, ISSN={["1872-7352"]}, DOI={10.1016/j.csda.2013.05.011}, abstractNote={The nested case-control (NCC) design is widely used in epidemiologic studies as a cost-effective subcohort sampling method to study the association between a disease and its potential risk factors. NCC data are commonly analyzed using Thomas' partial likelihood approach under the Cox proportional hazards model assumption. However, the linear modeling form in the Cox model may be insufficient for practical applications, especially when there are a large number of risk factors under investigation. In this paper, we consider a partially linear single index proportional hazard model, which includes a linear component for covariates of interest to yield easily interpretable results and a nonparametric single index component to adjust for multiple confounders effectively. We propose to approximate the nonparametric single index function by polynomial splines and estimate the parameters of interest using an iterative algorithm based on the partial likelihood. Asymptotic properties of the resulting estimators are established. The proposed methods are evaluated using simulations and applied to an NCC study of ovarian cancer.}, journal={COMPUTATIONAL STATISTICS \& DATA ANALYSIS}, author={Shang, Shulian and Liu, Mengling and Zeleniuch-Jacquotte, Anne and Clendenen, Tess V. 
and Krogh, Vittorio and Hallmans, Goran and Lu, Wenbin}, year={2013}, month={Nov}, pages={199–212} } @article{yan_zhang_lu_grifo_liu_2012, title={A Semi-nonparametric Approach to Joint Modeling of A Primary Binary Outcome and Longitudinal Data Measured at Discrete Informative Times}, volume={4}, ISSN={1867-1764 1867-1772}, url={http://dx.doi.org/10.1007/S12561-011-9053-2}, DOI={10.1007/S12561-011-9053-2}, abstractNote={In a study conducted at the New York University Fertility Center, one of the scientific objectives is to investigate the relationship between the final pregnancy outcomes of participants receiving an in vitro fertilization (IVF) treatment and their β-human chorionic gonadotrophin (β-hCG) profiles. A common joint modeling approach to this objective is to use subject-specific normal random effects in a linear mixed model for longitudinal β-hCG data as predictors in a model (e.g., logistic model) for the final pregnancy outcome. Empirical data exploration indicates that the observation times for longitudinal β-hCG data may be informative and the distribution of random effects for longitudinal β-hCG data may not be normally distributed. We propose to introduce a third model in the joint model for the informative β-hCG observation times, and relax the normality distributional assumption of random effects using the semi-nonparametric (SNP) approach of Gallant and Nychka (Econometrica 55:363–390, 1987). An EM algorithm is developed for parameter estimation. Extensive simulation designed to evaluate the proposed method indicates that ignoring either informative observation times or distributional assumption of the random effects would lead to invalid and/or inefficient inference. 
Applying our new approach to the data reveals some interesting findings the traditional approach failed to discover.}, number={2}, journal={Statistics in Biosciences}, publisher={Springer Science and Business Media LLC}, author={Yan, Song and Zhang, Daowen and Lu, Wenbin and Grifo, James A. and Liu, Mengling}, year={2012}, month={Jan}, pages={213–234} } @article{lu_goldberg_fine_2012, title={On the robustness of the adaptive lasso to model misspecification}, volume={99}, ISSN={["1464-3510"]}, DOI={10.1093/biomet/ass027}, abstractNote={Penalization methods have been shown to yield both consistent variable selection and oracle parameter estimation under correct model specification. In this article, we study such methods under model misspecification, where the assumed form of the regression function is incorrect, including generalized linear models for uncensored outcomes and the proportional hazards model for censored responses. Estimation with the adaptive least absolute shrinkage and selection operator, lasso, penalty is proven to achieve sparse estimation of regression coefficients under misspecification. The resulting estimators are selection consistent, asymptotically normal and oracle, where the selection is based on the limiting values of the parameter estimators obtained using the misspecified model without penalization. We further derive conditions under which the penalized estimators from the misspecified model may yield selection consistency under the true model. The robustness is explored numerically via simulation and an application to the Wisconsin Epidemiological Study of Diabetic Retinopathy.}, number={3}, journal={BIOMETRIKA}, author={Lu, W. and Goldberg, Y. and Fine, J. 
P.}, year={2012}, month={Sep}, pages={717–731} } @article{sui_zhang_lee_church_lu_liu_blair_2013, title={Physical activity/fitness peaks during perimenopause and BMI change patterns are not associated with baseline activity/fitness in women: a longitudinal study with a median 7-year follow-up}, volume={47}, DOI={10.1136/bjsports-2011-090888}, abstractNote={Objective To assess the age-associated longitudinal trends in cardiorespiratory fitness (CRF), leisure-time physical activity (PA), and body mass index (BMI) across the lifespan in a cohort of adult women. Methods The sample included 1467 women from the Aerobics Center Longitudinal Study who were 30–79 years old at baseline and had 3–22 health examinations between 1971 and 2006. CRF was quantified by maximal Balke treadmill exercise tests. The total metabolic equivalent-minutes/week of self-reported PA and measured BMI (kg/m2) were calculated. Results The overall pattern of CRF decreased over time. After age 60 years, fitness level tended to decline rapidly. Women at age 50 had the highest PA level, which decreased after age 50 and plateaued at age 60. The overall pattern of BMI increased with age. However, after age 60 years the rate of increase in BMI became much slower. Adjusting for smoking, health status, and the individual exposures of CRF, PA and BMI did not influence the observed associations. Women who did not meet current PA recommendation or those who were low fit at baseline had a higher BMI throughout adulthood than their more active or fit peers, but the trajectory of BMI was unassociated with baseline activity or fitness levels. Conclusion We concluded that the age-related longitudinal patterns in physical activity and fitness are not linear. Baseline activity and fitness levels are associated with BMI status during adulthood, but do not affect BMI change trajectory.}, number={2}, journal={British Journal of Sports Medicine}, author={Sui, X. M. and Zhang, J. J. and Lee, D. C. and Church, T. S. 
and Lu, W. B. and Liu, J. X. and Blair, S. N.}, year={2013}, pages={77–82} } @article{wang_zhang_lu_2012, title={Sample size calculation for the proportional hazards cure model}, volume={31}, ISSN={["1097-0258"]}, DOI={10.1002/sim.5465}, abstractNote={In clinical trials with time‐to‐event endpoints, it is not uncommon to see a significant proportion of patients being cured (or long‐term survivors), such as trials for the non‐Hodgkins lymphoma disease. The popularly used sample size formula derived under the proportional hazards (PH) model may not be proper to design a survival trial with a cure fraction, because the PH model assumption may be violated. To account for a cure fraction, the PH cure model is widely used in practice, where a PH model is used for survival times of uncured patients and a logistic distribution is used for the probability of patients being cured. In this paper, we develop a sample size formula on the basis of the PH cure model by investigating the asymptotic distributions of the standard weighted log‐rank statistics under the null and local alternative hypotheses. The derived sample size formula under the PH cure model is more flexible because it can be used to test the differences in the short‐term survival and/or cure fraction. Furthermore, we also investigate as numerical examples the impacts of accrual methods and durations of accrual and follow‐up periods on sample size calculation. The results show that ignoring the cure rate in sample size calculation can lead to either underpowered or overpowered studies. We evaluate the performance of the proposed formula by simulation studies and provide an example to illustrate its application with the use of data from a melanoma trial. 
Copyright © 2012 John Wiley & Sons, Ltd.}, number={29}, journal={STATISTICS IN MEDICINE}, author={Wang, Songfeng and Zhang, Jiajia and Lu, Wenbin}, year={2012}, month={Dec}, pages={3959–3971} } @article{pang_lu_wang_2012, title={Variance estimation in censored quantile regression via induced smoothing}, volume={56}, ISSN={["1872-7352"]}, DOI={10.1016/j.csda.2010.10.018}, abstractNote={Statistical inference in censored quantile regression is challenging, partly due to the unsmoothness of the quantile score function. A new procedure is developed to estimate the variance of Bang and Tsiatis's inverse-censoring-probability weighted estimator for censored quantile regression by employing the idea of induced smoothing. The proposed variance estimator is shown to be asymptotically consistent. In addition, numerical study suggests that the proposed procedure performs well in finite samples, and it is computationally more efficient than the commonly used bootstrap method.}, number={4}, journal={COMPUTATIONAL STATISTICS \& DATA ANALYSIS}, author={Pang, Lei and Lu, Wenbin and Wang, Huixia Judy}, year={2012}, month={Apr}, pages={785–796} } @article{lu_liu_2012, title={On estimation of linear transformation models with nested case-control sampling}, volume={18}, ISSN={["1380-7870"]}, DOI={10.1007/s10985-011-9203-3}, abstractNote={Nested case–control (NCC) sampling is widely used in large epidemiological cohort studies for its cost effectiveness, but its data analysis primarily relies on the Cox proportional hazards model. In this paper, we consider a family of linear transformation models for analyzing NCC data and propose an inverse selection probability weighted estimating equation method for inference. Consistency and asymptotic normality of our estimators for regression coefficients are established. We show that the asymptotic variance has a closed analytic form and can be easily estimated. 
Numerical studies are conducted to support the theory and an application to the Wilms’ Tumor Study is also given to illustrate the methodology.}, number={1}, journal={LIFETIME DATA ANALYSIS}, author={Lu, Wenbin and Liu, Mengling}, year={2012}, month={Jan}, pages={80–93} } @article{lu_zhang_zeng_2013, title={Variable selection for optimal treatment decision}, volume={22}, ISSN={["1477-0334"]}, DOI={10.1177/0962280211428383}, abstractNote={ In decision-making on optimal treatment strategies, it is of great importance to identify variables that are involved in the decision rule, i.e. those interacting with the treatment. Effective variable selection helps to improve the prediction accuracy and enhance the interpretability of the decision rule. We propose a new penalized regression framework which can simultaneously estimate the optimal treatment strategy and identify important variables. The advantages of the new approach include: (i) it does not require the estimation of the baseline mean function of the response, which greatly improves the robustness of the estimator; (ii) the convenient loss-based framework makes it easier to adopt shrinkage methods for variable selection, which greatly facilitates implementation and statistical inferences for the estimator. The new procedure can be easily implemented by existing state-of-art software packages like LARS. Theoretical properties of the new estimator are studied. Its empirical performance is evaluated using simulation studies and further illustrated with an application to an AIDS clinical trial. 
}, number={5}, journal={STATISTICAL METHODS IN MEDICAL RESEARCH}, author={Lu, Wenbin and Zhang, Hao Helen and Zeng, Donglin}, year={2013}, month={Oct}, pages={493–504} } @article{liu_lu_tseng_2010, title={Cox Regression in Nested Case-Control Studies with Auxiliary Covariates}, volume={66}, ISSN={["0006-341X"]}, DOI={10.1111/j.1541-0420.2009.01277.x}, abstractNote={Summary Nested case–control (NCC) design is a popular sampling method in large epidemiological studies for its cost effectiveness to investigate the temporal relationship of diseases with environmental exposures or biological precursors. Thomas' maximum partial likelihood estimator is commonly used to estimate the regression parameters in Cox's model for NCC data. In this article, we consider a situation in which failure/censoring information and some crude covariates are available for the entire cohort in addition to NCC data and propose an improved estimator that is asymptotically more efficient than Thomas' estimator. We adopt a projection approach that, heretofore, has only been employed in situations of random validation sampling and show that it can be well adapted to NCC designs where the sampling scheme is a dynamic process and is not independent for controls. Under certain conditions, consistency and asymptotic normality of the proposed estimator are established and a consistent variance estimator is also developed. Furthermore, a simplified approximate estimator is proposed when the disease is rare. Extensive simulations are conducted to evaluate the finite sample performance of our proposed estimators and to compare the efficiency with Thomas' estimator and other competing estimators. Moreover, sensitivity analyses are conducted to demonstrate the behavior of the proposed estimator when model assumptions are violated, and we find that the biases are reasonably small in realistic situations. 
We further demonstrate the proposed method with data from studies on Wilms' tumor.}, number={2}, journal={BIOMETRICS}, author={Liu, Mengling and Lu, Wenbin and Tseng, Chi-hong}, year={2010}, month={Jun}, pages={374–381} } @article{liu_lu_shore_zeleniuch-jacquotte_2010, title={Cox regression model with time-varying coefficients in nested case-control studies}, volume={11}, ISSN={["1465-4644"]}, DOI={10.1093/biostatistics/kxq037}, abstractNote={The nested case-control (NCC) design is a cost-effective sampling method to study the relationship between a disease and its risk factors in epidemiologic studies. NCC data are commonly analyzed using Thomas' partial likelihood approach under Cox's proportional hazards model with constant covariate effects. Here, we are interested in studying the potential time-varying effects of covariates in NCC studies and propose an estimation approach based on a kernel-weighted Thomas' partial likelihood. We establish asymptotic properties of the proposed estimator, propose a numerical approach to construct simultaneous confidence bands for time-varying coefficients, and develop a hypothesis testing procedure to detect time-varying coefficients. The proposed inference procedure is evaluated in simulations and applied to an NCC study of breast cancer in the New York University Women's Health Study.}, number={4}, journal={BIOSTATISTICS}, author={Liu, Mengling and Lu, Wenbin and Shore, Roy E. and Zeleniuch-Jacquotte, Anne}, year={2010}, month={Oct}, pages={693–706} } @article{lu_2010, title={Efficient estimation for an accelerated failure time model with a cure fraction}, volume={20}, number={2}, journal={Statistica Sinica}, author={Lu, W. 
B.}, year={2010}, pages={661–674} } @inproceedings{cai_chow_lu_li_2010, title={Evaluation of distribution fault diagnosis algorithms using ROC curves}, DOI={10.1109/pes.2010.5588154}, abstractNote={In power distribution fault data, the percentage of faults with different causes could be very different and varies from region to region. This data imbalance issue seriously affects the performance evaluation of fault diagnosis algorithms. Due to the limitations of conventional accuracy (ACC) and geometric mean (G-mean) measures, this paper discusses the application of Receiver Operating Characteristic (ROC) curves in evaluating distribution fault diagnosis performance. After introducing how to obtain ROC curves, Artificial Neural Networks (ANN), Logistic Regression (LR), Support Vector Machines (SVM), Artificial Immune Recognition Systems (AIRS), and K-Nearest Neighbor (KNN) algorithm are compared using ROC curves and Area Under the Curve (AUC) on real-world fault datasets from Progress Energy Carolinas. Experimental results show that AIRS performs best most of the time and ANN is potentially a good algorithm with a proper decision threshold.}, booktitle={{IEEE} Power and Energy Society General Meeting 2010}, author={Cai, Y. X. and Chow, M. Y. and Lu, W. B. and Li, L. X.}, year={2010} } @article{tzeng_lu_farmen_liu_sullivan_2010, title={Haplotype-Based Pharmacogenetic Analysis for Longitudinal Quantitative Traits in the Presence of Dropout}, volume={20}, ISSN={["1520-5711"]}, DOI={10.1080/10543400903572787}, abstractNote={We propose a variety of methods based on the generalized estimation equations to address the issues encountered in haplotype-based pharmacogenetic analysis, including analysis of longitudinal data with outcome-dependent dropouts, and evaluation of the high-dimensional haplotype and haplotype–drug interaction effects in an overall manner. 
We use the inverse probability weights to handle the outcome-dependent dropouts under the missing-at-random assumption, and incorporate the weighted $L_1$ penalty to select important main and interaction effects with high dimensionality. The proposed methods are easy to implement, computationally efficient, and provide an optimal balance between false positives and false negatives in detecting genetic effects.}, number={2}, journal={JOURNAL OF BIOPHARMACEUTICAL STATISTICS}, author={Tzeng, Jung-Ying and Lu, Wenbin and Farmen, Mark W. and Liu, Youfang and Sullivan, Patrick F.}, year={2010}, pages={334–350} } @article{lu_zhang_2010, title={On Estimation of Partially Linear Transformation Models}, volume={105}, ISSN={["1537-274X"]}, DOI={10.1198/jasa.2010.tm09302}, abstractNote={We study a general class of partially linear transformation models, which extend linear transformation models by incorporating nonlinear covariate effects in survival data analysis. A new martingale-based estimating equation approach, consisting of both global and kernel-weighted local estimation equations, is developed for estimating the parametric and nonparametric covariate effects in a unified manner. We show that with a proper choice of the kernel bandwidth parameter, one can obtain the consistent and asymptotically normal parameter estimates for the linear effects. Asymptotic properties of the estimated nonlinear effects are established as well. We further suggest a simple resampling method to estimate the asymptotic variance of the linear estimates and show its effectiveness. To facilitate the implementation of the new procedure, an iterative algorithm is developed. Numerical examples are given to illustrate the finite-sample performance of the procedure. 
Supplementary materials are available online.}, number={490}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Lu, Wenbin and Zhang, Hao Helen}, year={2010}, month={Jun}, pages={683–691} } @article{zhang_lu_wang_2010, title={On sparse estimation for semiparametric linear transformation models}, volume={101}, ISSN={["0047-259X"]}, DOI={10.1016/j.jmva.2010.01.015}, abstractNote={Semiparametric linear transformation models have received much attention due to their high flexibility in modeling survival data. A useful estimating equation procedure was recently proposed by Chen et al. (2002) [21] for linear transformation models to jointly estimate parametric and nonparametric terms. They showed that this procedure can yield a consistent and robust estimator. However, the problem of variable selection for linear transformation models has been less studied, partially because a convenient loss function is not readily available under this context. In this paper, we propose a simple yet powerful approach to achieve both sparse and consistent estimation for linear transformation models. The main idea is to derive a profiled score from the estimating equation of Chen et al. [21], construct a loss function based on the profile scored and its variance, and then minimize the loss subject to some shrinkage penalty. Under regularity conditions, we have shown that the resulting estimator is consistent for both model estimation and variable selection. Furthermore, the estimated parametric terms are asymptotically normal and can achieve a higher efficiency than that yielded from the estimation equations. For computation, we suggest a one-step approximation algorithm which can take advantage of the LARS and build the entire solution path efficiently. 
Performance of the new procedure is illustrated through numerous simulations and real examples including one microarray data.}, number={7}, journal={JOURNAL OF MULTIVARIATE ANALYSIS}, author={Zhang, Hao Helen and Lu, Wenbin and Wang, Hansheng}, year={2010}, month={Aug}, pages={1594–1606} } @article{shows_lu_zhang_2010, title={Sparse estimation and inference for censored median regression}, volume={140}, ISSN={["1873-1171"]}, DOI={10.1016/j.jspi.2010.01.043}, abstractNote={Censored median regression has proved useful for analyzing survival data in complicated situations, say, when the variance is heteroscedastic or the data contain outliers. In this paper, we study the sparse estimation for censored median regression models, which is an important problem for high dimensional survival data analysis. In particular, a new procedure is proposed to minimize an inverse-censoring-probability weighted least absolute deviation loss subject to the adaptive LASSO penalty and result in a sparse and robust median estimator. We show that, with a proper choice of the tuning parameter, the procedure can identify the underlying sparse model consistently and has desired large-sample properties including root-n consistency and the asymptotic normality. The procedure also enjoys great advantages in computation, since its entire solution path can be obtained efficiently. Furthermore, we propose a resampling method to estimate the variance of the estimator. 
The performance of the procedure is illustrated by extensive simulations and two real data applications including one microarray gene expression survival data.}, number={7}, journal={JOURNAL OF STATISTICAL PLANNING AND INFERENCE}, author={Shows, Justin Hall and Lu, Wenbin and Zhang, Hao Helen}, year={2010}, month={Jul}, pages={1903–1917} } @article{lu_li_2011, title={Sufficient Dimension Reduction for Censored Regressions}, volume={67}, ISSN={["1541-0420"]}, DOI={10.1111/j.1541-0420.2010.01490.x}, abstractNote={Summary Methodology of sufficient dimension reduction (SDR) has offered an effective means to facilitate regression analysis of high‐dimensional data. When the response is censored, however, most existing SDR estimators cannot be applied, or require some restrictive conditions. In this article, we propose a new class of inverse censoring probability weighted SDR estimators for censored regressions. Moreover, regularization is introduced to achieve simultaneous variable selection and dimension reduction. Asymptotic properties and empirical performance of the proposed methods are examined.}, number={2}, journal={BIOMETRICS}, author={Lu, Wenbin and Li, Lexin}, year={2011}, month={Jun}, pages={513–523} } @article{wenbin_2009, title={Efficiency comparison between mean and log-rank tests for recurrent event time data}, volume={52}, ISSN={["1006-9283"]}, DOI={10.1007/s11425-009-0059-x}, number={6}, journal={SCIENCE IN CHINA SERIES A-MATHEMATICS}, author={Lu, WenBin}, year={2009}, month={Jun}, pages={1169–1180} } @article{liang_lu_ying_2009, title={Joint Modeling and Analysis of Longitudinal Data with Informative Observation Times}, volume={65}, ISSN={["1541-0420"]}, DOI={10.1111/j.1541-0420.2008.01104.x}, abstractNote={Summary In analysis of longitudinal data, it is often assumed that observation times are predetermined and are the same across study subjects. Such an assumption, however, is often violated in practice. 
As a result, the observation times may be highly irregular. It is well known that if the sampling scheme is correlated with the outcome values, the usual statistical analysis may yield bias. In this article, we propose joint modeling and analysis of longitudinal data with possibly informative observation times via latent variables. A two‐step estimation procedure is developed for parameter estimation. We show that the resulting estimators are consistent and asymptotically normal, and that the asymptotic variance can be consistently estimated using the bootstrap method. Simulation studies and a real data analysis demonstrate that our method performs well with realistic sample sizes and is appropriate for practical use.}, number={2}, journal={BIOMETRICS}, author={Liang, Yu and Lu, Wenbin and Ying, Zhiliang}, year={2009}, month={Jun}, pages={377–384} } @article{liu_lu_shao_2008, title={A Monte Carlo approach for change-point detection in the Cox proportional hazards model}, volume={27}, ISSN={["0277-6715"]}, DOI={10.1002/sim.3214}, abstractNote={Detecting a time lag of treatment effect or identifying change points in a hazard function is of great interest and importance in survival analysis. The testing procedures hereto are primarily based on analytical approximations for the asymptotic null distribution of either the likelihood ratio test or the score test. In the presence of random censoring and/or covariates, however, the justification for the limiting distribution often requires some technical assumptions and conditions that are difficult to verify in practice. Moreover, a satisfactory asymptotic theory for testing the existence of multiple change points in hazard function has not emerged. In this paper, we consider maximal score tests for detecting change point(s) in the Cox proportional hazards model with censored data. We propose to use a simple Monte Carlo approach for assessing the statistical significance of tests. 
The proposed approach is applicable for testing a single change point in the Cox model with covariates and sample stratifications over various types of candidate regions, including discrete time‐point sets or disjoint intervals. We also show that the proposed test statistics and the Monte Carlo procedure are well applicable under situations with multiple change points. Simulation studies and an analysis of a real data from a randomized cancer trial are conducted to demonstrate the finite‐sample performance of the proposed approach. Copyright © 2008 John Wiley & Sons, Ltd.}, number={19}, journal={STATISTICS IN MEDICINE}, author={Liu, Mengling and Lu, Wenbin and Shao, Yongzhao}, year={2008}, month={Aug}, pages={3894–3909} } @article{lu_liang_2008, title={Analysis of competing risks data with missing cause of failure under additive hazards model}, volume={18}, number={1}, journal={Statistica Sinica}, author={Lu, W. B. and Liang, Y.}, year={2008}, pages={219–234} } @article{lu_li_2008, title={Boosting method for nonlinear transformation models with censored survival data}, volume={9}, ISSN={["1465-4644"]}, DOI={10.1093/biostatistics/kxn005}, abstractNote={We propose a general class of nonlinear transformation models for analyzing censored survival data, of which the nonlinear proportional hazards and proportional odds models are special cases. A cubic smoothing spline-based component-wise boosting algorithm is derived to estimate covariate effects nonparametrically using the gradient of the marginal likelihood, that is computed using importance sampling. The proposed method can be applied to survival data with high-dimensional covariates, including the case when the sample size is smaller than the number of predictors. 
Empirical performance of the proposed method is evaluated via simulations and analysis of a microarray survival data.}, number={4}, journal={BIOSTATISTICS}, author={Lu, Wenbin and Li, Lexin}, year={2008}, month={Oct}, pages={658–667} } @article{lu_peng_2008, title={Semiparametric analysis of mixture regression models with competing risks data}, volume={14}, ISSN={["1380-7870"]}, DOI={10.1007/s10985-007-9077-6}, abstractNote={In the analysis of competing risks data, cumulative incidence function is a useful summary of the overall crude risk for a failure type of interest. Mixture regression modeling has served as a natural approach to performing covariate analysis based on this quantity. However, existing mixture regression methods with competing risks data either impose parametric assumptions on the conditional risks or require stringent censoring assumptions. In this article, we propose a new semiparametric regression approach for competing risks data under the usual conditional independent censoring mechanism. We establish the consistency and asymptotic normality of the resulting estimators. A simple resampling method is proposed to approximate the distribution of the estimated parameters and that of the predicted cumulative incidence functions. Simulation studies and an analysis of a breast cancer dataset demonstrate that our method performs well with realistic sample sizes and is appropriate for practical use.}, number={3}, journal={LIFETIME DATA ANALYSIS}, author={Lu, Wenbin and Peng, Limin}, year={2008}, month={Sep}, pages={231–252} } @article{li_lu_2008, title={Sufficient dimension reduction with missing predictors}, volume={103}, ISSN={["0162-1459"]}, DOI={10.1198/016214508000000283}, abstractNote={In high-dimensional data analysis, sufficient dimension reduction (SDR) methods are effective in reducing the predictor dimension, while retaining full regression information and imposing no parametric models. 
However, it is common in high-dimensional data that a subset of predictors may have missing observations. Existing SDR methods resort to the complete-case analysis by removing all the subjects with missingness in any of the predictors under inquiry. Such an approach does not make effective use of the data and is valid only when missingness is independent of both observed and unobserved quantities. In this article, we propose a new class of SDR estimators under a more general missingness mechanism that allows missingness to depend on the observed data. We focus on a widely used SDR method, sliced inverse regression, and propose an augmented inverse probability weighted sliced inverse regression estimator (AIPW–SIR). We show that AIPW–SIR is doubly robust and asymptotically consistent and demonstrate that AIPW–SIR is more effective than the complete-case analysis through both simulations and real data analysis. We also outline the extension of the AIPW strategy to other SDR methods, including sliced average variance estimation and principal Hessian directions.}, number={482}, journal={JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION}, author={Li, Lexin and Lu, Wenbin}, year={2008}, month={Jun}, pages={822–831} } @article{zhang_lu_2007, title={Adaptive lasso for Cox's proportional hazards model}, volume={94}, ISSN={["0006-3444"]}, DOI={10.1093/biomet/asm037}, abstractNote={We investigate the variable selection problem for Cox's proportional hazards model, and propose a unified model selection and estimation procedure with desired theoretical properties and computational convenience. The new method is based on a penalized log partial likelihood with the adaptively weighted $L_1$ penalty on regression coefficients, providing what we call the adaptive Lasso estimator. 
The method incorporates different penalties for different coefficients: unimportant variables receive larger penalties than important ones, so that important variables tend to be retained in the selection process, whereas unimportant variables are more likely to be dropped. Theoretical properties, such as consistency and rate of convergence of the estimator, are studied. We also show that, with proper choice of regularization parameters, the proposed estimator has the oracle properties. The convex optimization nature of the method leads to an efficient algorithm. Both simulated and real examples show that the method performs competitively. Copyright 2007, Oxford University Press.}, number={3}, journal={BIOMETRIKA}, author={Zhang, Hao Helen and Lu, Wenbin}, year={2007}, month={Aug}, pages={691–703} } @article{bernholc_lu_nakhmanson_hahn_meunier_nardelli_schmidt_2007, title={Atomic scale design of nanostructures}, volume={105}, ISSN={["1362-3028"]}, DOI={10.1080/00268970701189186}, abstractNote={Recent advances in theoretical methods and high performance computing allow for reliable first-principles predictions of complex nanostructured materials and devices. This paper describes three examples: (i) non-equilibrium electron transport through molecular junctions, as a stepping stone for the design of molecular-scale devices and for integration of biomolecules with Si technology; (ii) polarization and piezoelectric properties of PVDF and related polymers; and (iii) the many-body optical spectrum of water. For the molecular junction, our results provide a qualitative picture and quantitative understanding of the mechanism leading to negative differential resistance for a large class of small molecules. For ferroelectric polymers, the calculations show that their polarization is described by cooperative, quantum-mechanical interactions between polymer chains. 
Nevertheless, the ab initio results lead to a simple parameterization of polarization as a function of copolymer concentration. Finally, our calculations explain the well-known redshift in the fundamental absorption of water as due to exciton delocalization upon aggregation.}, number={2-3}, journal={MOLECULAR PHYSICS}, author={Bernholc, J. and Lu, W. and Nakhmanson, S. M. and Hahn, P. H. and Meunier, V. and Buongiorno Nardelli, M. and Schmidt, W. G.}, year={2007}, pages={147–156} } @article{lu_2008, title={Maximum likelihood estimation in the proportional hazards cure model}, volume={60}, ISSN={["1572-9052"]}, DOI={10.1007/s10463-007-0120-x}, number={3}, journal={ANNALS OF THE INSTITUTE OF STATISTICAL MATHEMATICS}, author={Lu, Wenbin}, year={2008}, month={Sep}, pages={545–574} } @article{lu_2007, title={Tests of independence for censored bivariate failure time data}, volume={13}, ISSN={["1572-9249"]}, DOI={10.1007/s10985-006-9031-z}, abstractNote={Bivariate failure time data is widely used in survival analysis, for example, in twins study. This article presents a class of $\chi^2$-type tests for independence between pairs of failure times after adjusting for covariates. A bivariate accelerated failure time model is proposed for the joint distribution of bivariate failure times while leaving the dependence structures for related failure times completely unspecified. Theoretical properties of the proposed tests are derived and variance estimates of the test statistics are obtained using a resampling technique. Simulation studies show that the proposed tests are appropriate for practical use. 
Two examples including the study of infection in catheters for patients on dialysis and the diabetic retinopathy study are also given to illustrate the methodology.}, number={1}, journal={LIFETIME DATA ANALYSIS}, author={Lu, Wenbin}, year={2007}, month={Mar}, pages={75–90} } @article{lu_zhang_2007, title={Variable selection for proportional odds model}, volume={26}, ISSN={["1097-0258"]}, DOI={10.1002/sim.2833}, abstractNote={In this paper we study the problem of variable selection for the proportional odds model, which is a useful alternative to the proportional hazards model and might be appropriate when the proportional hazards assumption is not satisfied. We propose to fit the proportional odds model by maximizing the marginal likelihood subject to a shrinkage‐type penalty, which encourages sparse solutions and hence facilitates the process of variable selection. Two types of shrinkage penalties are considered: the LASSO and the adaptive‐LASSO (ALASSO) penalty. In the ALASSO penalty, different weights are imposed on different coefficients such that important variables are more protectively retained in the final model while unimportant ones are more likely to be shrunk to zeros. We further provide an efficient computation algorithm to implement the proposed methods, and demonstrate their performance through simulation studies and an application to real data. Numerical results indicate that both methods can produce accurate and interpretable models, and the ALASSO tends to work better than the usual LASSO. 
Copyright © 2007 John Wiley & Sons, Ltd.}, number={20}, journal={STATISTICS IN MEDICINE}, author={Lu, Wenbin and Zhang, Hao H.}, year={2007}, month={Sep}, pages={3771–3781} } @article{lu_liang_2006, title={Empirical likelihood inference for linear transformation models}, volume={97}, ISSN={["0047-259X"]}, DOI={10.1016/j.jmva.2005.09.007}, abstractNote={Empirical likelihood inference is developed for censored survival data under the linear transformation models, which generalize Cox's [Regression models and life tables (with Discussion), J. Roy. Statist. Soc. Ser. B 34 (1972) 187–220] proportional hazards model. We show that the limiting distribution of the empirical likelihood ratio is a weighted sum of standard chi-squared distribution. Empirical likelihood ratio tests for the regression parameters with and without covariate adjustments are also derived. Simulation studies suggest that the empirical likelihood ratio tests are more accurate (under the null hypothesis) and powerful (under the alternative hypothesis) than the normal approximation based tests of Chen et al. [Semiparametric of transformation models with censored data, Biometrika 89 (2002) 659–668] when the model is different from the proportional hazards model and the proportion of censoring is high.}, number={7}, journal={JOURNAL OF MULTIVARIATE ANALYSIS}, author={Lu, W. B. and Liang, Y.}, year={2006}, month={Aug}, pages={1586–1599} } @article{liu_lu_shao_2006, title={Interval mapping of quantitative trait loci for time-to-event data with the proportional hazards mixture cure model}, volume={62}, ISSN={["0006-341X"]}, DOI={10.1111/j.1541-0420.2006.00585.x}, abstractNote={Summary Interval mapping using normal mixture models has been an important tool for analyzing quantitative traits in experimental organisms. When the primary phenotype is time‐to‐event, it is natural to use survival models such as Cox's proportional hazards model instead of normal mixtures to model the phenotype distribution. 
An extra challenge for modeling time‐to‐event data is that the underlying population may consist of susceptible and nonsusceptible subjects. In this article, we propose a semiparametric proportional hazards mixture cure model which allows missing covariates. We discuss applications to quantitative trait loci (QTL) mapping when the primary trait is time‐to‐event from a population of mixed susceptibility. This model can be used to characterize QTL effects on both susceptibility and time‐to‐event distribution, and to estimate QTL location. The model can naturally incorporate covariate effects of other risk factors. Maximum likelihood estimates for the parameters in the model as well as their corresponding variance estimates can be obtained numerically using an EM‐type algorithm. The proposed methods are assessed by simulations under practical settings and illustrated using a real data set containing survival times of mice after infection with Listeria monocytogenes. An extension to multiple intervals is also discussed.}, number={4}, journal={BIOMETRICS}, author={Liu, Mengling and Lu, Wenbin and Shao, Yongzhao}, year={2006}, month={Dec}, pages={1053–1061} } @article{liu_lu_shao_2006, title={Mixture cure model with an application to interval mapping of quantitative trait loci}, volume={12}, ISSN={["1572-9249"]}, DOI={10.1007/s10985-006-9025-x}, abstractNote={When censored time-to-event data are used to map quantitative trait loci (QTL), the existence of nonsusceptible subjects entails extra challenges. If the heterogeneous susceptibility is ignored or inappropriately handled, we may either fail to detect the responsible genetic factors or find spuriously significant locations. In this article, an interval mapping method based on parametric mixture cure models is proposed, which takes into consideration of nonsusceptible subjects. The proposed model can be used to detect the QTL that are responsible for differential susceptibility and/or time-to-event trait distribution. 
In particular, we propose a likelihood-based testing procedure with genome-wide significance levels calculated using a resampling method. The performance of the proposed method and the importance of considering the heterogeneous susceptibility are demonstrated by simulation studies and an application to survival data from an experiment on mice infected with Listeria monocytogenes.}, number={4}, journal={LIFETIME DATA ANALYSIS}, author={Liu, Mengling and Lu, Wenbin and Shao, Yongzhao}, year={2006}, month={Dec}, pages={421–440} } @article{lu_tsiatis_2006, title={Semiparametric transformation models for the case-cohort study}, volume={93}, ISSN={["0006-3444"]}, DOI={10.1093/biomet/93.1.207}, abstractNote={A general class of semiparametric transformation models is studied for analysing survival data from the case-cohort design, which was introduced by Prentice (1986). Weighted estimating equations are proposed for simultaneous estimation of the regression parameters and the transformation function. It is shown that the resulting regression estimators are asymptotically normal, with variance-covariance matrix that has a closed form and can be consistently estimated by the usual plug-in method. Simulation studies show that the proposed approach is appropriate for practical use. An application to a case-cohort dataset from the Atherosclerosis Risk in Communities study is also given to illustrate the methodology. Copyright 2006, Oxford University Press.}, number={1}, journal={BIOMETRIKA}, author={Lu, WB and Tsiatis, AA}, year={2006}, month={Mar}, pages={207–214} } @article{lu_2005, title={Marginal regression of multivariate event times based on linear transformation models}, volume={11}, ISSN={["1380-7870"]}, DOI={10.1007/s10985-005-2969-4}, abstractNote={Multivariate event time data are common in medical studies and have received much attention recently. 
In such data, each study subject may potentially experience several types of events or recurrences of the same type of event, or event times may be clustered. Marginal distributions are specified for the multivariate event times in multiple events and clustered events data, and for the gap times in recurrent events data, using the semiparametric linear transformation models while leaving the dependence structures for related events unspecified. We propose several estimating equations for simultaneous estimation of the regression parameters and the transformation function. It is shown that the resulting regression estimators are asymptotically normal, with variance-covariance matrix that has a closed form and can be consistently estimated by the usual plug-in method. Simulation studies show that the proposed approach is appropriate for practical use. An application to the well-known bladder cancer tumor recurrences data is also given to illustrate the methodology.}, number={3}, journal={LIFETIME DATA ANALYSIS}, author={Lu, WB}, year={2005}, month={Sep}, pages={389–404} }