@article{gill_lester_free_pfaff_iversen_reich_yang_ahmadia_brown_darling_et_al_2024,
  title = {A diverse portfolio of marine protected areas can better advance global conservation and equity},
  volume = {121},
  issn = {1091-6490},
  doi = {10.1073/pnas.2313205121},
  abstractNote = {Marine protected areas (MPAs) are widely used for ocean conservation, yet the relative impacts of various types of MPAs are poorly understood. We estimated impacts on fish biomass from no-take and multiple-use (fished) MPAs, employing a rigorous matched counterfactual design with a global dataset of >14,000 surveys in and around 216 MPAs. Both no-take and multiple-use MPAs generated positive conservation outcomes relative to no protection (58.2% and 12.6% fish biomass increases, respectively), with smaller estimated differences between the two MPA types when controlling for additional confounding factors (8.3% increase). Relative performance depended on context and management: no-take MPAs performed better in areas of high human pressure but similar to multiple-use in remote locations. Multiple-use MPA performance was low in high-pressure areas but improved significantly with better management, producing similar outcomes to no-take MPAs when adequately staffed and appropriate use regulations were applied. For priority conservation areas where no-take restrictions are not possible or ethical, our findings show that a portfolio of well-designed and well-managed multiple-use MPAs represents a viable and potentially equitable pathway to advance local and global conservation.},
  number = {10},
  journal = {PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES OF THE UNITED STATES OF AMERICA},
  author = {Gill, David A. and Lester, Sarah E. and Free, Christopher M. and Pfaff, Alexander and Iversen, Edwin and Reich, Brian J. and Yang, Shu and Ahmadia, Gabby and Andradi-Brown, Dominic A. and Darling, Emily S. and others},
  year = {2024},
  month = mar
}

@article{colnet_mayer_chen_dieng_li_varoquaux_vert_josse_yang_2024,
  title = {Causal Inference Methods for Combining Randomized Trials and Observational Studies: A Review},
  volume = {39},
  issn = {2168-8745},
  doi = {10.1214/23-STS889},
  abstractNote = {With increasing data availability, causal effects can be evaluated across different data sets, both randomized controlled trials (RCTs) and observational studies. RCTs isolate the effect of the treatment from that of unwanted (confounding) co-occurring effects but they may suffer from unrepresentativeness, and thus lack external validity. On the other hand, large observational samples are often more representative of the target population but can conflate confounding effects with the treatment of interest. In this paper, we review the growing literature on methods for causal inference on combined RCTs and observational studies, striving for the best of both worlds. We first discuss identification and estimation methods that improve generalizability of RCTs using the representativeness of observational data. Classical estimators include weighting, difference between conditional outcome models, and doubly robust estimators. We then discuss methods that combine RCTs and observational data to either ensure unconfoundedness of the observational analysis or to improve (conditional) average treatment effect estimation. We also connect and contrast works developed in both the potential outcomes literature and the structural causal model literature. Finally, we compare the main methods using a simulation study and real world data to analyze the effect of tranexamic acid on the mortality rate in major trauma patients.
A review of available codes and new implementations is also provided.},
  number = {1},
  journal = {STATISTICAL SCIENCE},
  author = {Colnet, Benedicte and Mayer, Imke and Chen, Guanhua and Dieng, Awa and Li, Ruohong and Varoquaux, Gael and Vert, Jean-Philippe and Josse, Julie and Yang, Shu},
  year = {2024},
  month = feb,
  pages = {165--191}
}

@article{fairfax_yang_2024,
  title = {Distributional imputation for the analysis of censored recurrent events},
  volume = {4},
  issn = {1097-0258},
  doi = {10.1002/sim.10087},
  abstractNote = {Longitudinal clinical trials for which recurrent events endpoints are of interest are commonly subject to missing event data. Primary analyses in such trials are often performed assuming events are missing at random, and sensitivity analyses are necessary to assess robustness of primary analysis conclusions to missing data assumptions. Control‐based imputation is an attractive approach in superiority trials for imposing conservative assumptions on how data may be missing not at random. A popular approach to implementing control‐based assumptions for recurrent events is multiple imputation (MI), but Rubin's variance estimator is often biased for the true sampling variability of the point estimator in the control‐based setting. We propose distributional imputation (DI) with corresponding wild bootstrap variance estimation procedure for control‐based sensitivity analyses of recurrent events. We apply control‐based DI to a type I diabetes trial. In the application and simulation studies, DI produced more reasonable standard error estimates than MI with Rubin's combining rules in control‐based sensitivity analyses of recurrent events.},
  journal = {STATISTICS IN MEDICINE},
  author = {Fairfax, Sarah R.
and Yang, Shu},
  year = {2024},
  month = apr
}

@article{liu_yang_zhang_liu_2024,
  title = {Multiply robust estimators in longitudinal studies with missing data under control-based imputation},
  volume = {80},
  issn = {1541-0420},
  doi = {10.1093/biomtc/ujad036},
  number = {1},
  journal = {BIOMETRICS},
  author = {Liu, Siyi and Yang, Shu and Zhang, Yilong and Liu, Guanghan},
  year = {2024},
  month = jan
}

@article{wang_zhao_yang_tang_cui_li_faries_2024,
  title = {Propensity score matching for estimating a marginal hazard ratio},
  volume = {5},
  issn = {1097-0258},
  doi = {10.1002/sim.10103},
  abstractNote = {Propensity score matching is commonly used to draw causal inference from observational survival data. However, its asymptotic properties have yet to be established, and variance estimation is still open to debate. We derive the statistical properties of the propensity score matching estimator of the marginal causal hazard ratio based on matching with replacement and a fixed number of matches. We also propose a double‐resampling technique for variance estimation that takes into account the uncertainty due to propensity score estimation prior to matching.},
  journal = {STATISTICS IN MEDICINE},
  author = {Wang, Tongrong and Zhao, Honghe and Yang, Shu and Tang, Shuhan and Cui, Zhanglin and Li, Li and Faries, Douglas E.},
  year = {2024},
  month = may
}

@article{lee_gao_ghosh_yang_2024,
  title = {Transporting survival of an {HIV} clinical trial to the external target populations},
  volume = {3},
  issn = {1520-5711},
  doi = {10.1080/10543406.2024.2330216},
  abstractNote = {Due to the heterogeneity of the randomized controlled trial (RCT) and external target populations, the estimated treatment effect from the RCT is not directly applicable to the target population.
For example, the patient characteristics of the ACTG 175 HIV trial are significantly different from that of the three external target populations of interest: US early-stage HIV patients, Thailand HIV patients, and southern Ethiopia HIV patients. This paper considers several methods to transport the treatment effect from the ACTG 175 HIV trial to the target populations beyond the trial population. Most transport methods focus on continuous and binary outcomes; on the contrary, we derive and discuss several transport methods for survival outcomes: an outcome regression method based on a Cox proportional hazard (PH) model, an inverse probability weighting method based on the models for treatment assignment, sampling score, and censoring, and a doubly robust method that combines both methods, called the augmented calibration weighting (ACW) method. However, as the PH assumption was found to be incorrect for the ACTG 175 trial, the methods that depend on the PH assumption may lead to the biased quantification of the treatment effect. To account for the violation of the PH assumption, we extend the ACW method with the linear spline-based hazard regression model that does not require the PH assumption.
Applying the aforementioned methods for transportability, we explore the effect of PH assumption, or the violation thereof, on transporting the survival results from the ACTG 175 trial to various external populations.},
  journal = {JOURNAL OF BIOPHARMACEUTICAL STATISTICS},
  author = {Lee, Dasom and Gao, Chenyin and Ghosh, Sujit and Yang, Shu},
  year = {2024},
  month = mar
}

@article{lee_yang_berry_stinchcombe_cohen_wang_2024,
  title = {{genRCT}: a statistical analysis framework for generalizing {RCT} findings to real-world population},
  volume = {4},
  issn = {1520-5711},
  doi = {10.1080/10543406.2024.2333136},
  abstractNote = {When evaluating the real-world treatment effect, the analysis based on randomized clinical trials (RCTs) often introduces generalizability bias due to the difference in risk factors between the trial participants and the real-world patient population. This problem of lack of generalizability associated with the RCT-only analysis can be addressed by leveraging observational studies with large sample sizes that are representative of the real-world population. A set of novel statistical methods, termed "genRCT", for improving the generalizability of the trial has been developed using calibration weighting, which enforces the covariates balance between the RCT and observational study. This paper aims to review statistical methods for generalizing the RCT findings by harnessing information from large observational studies that represent real-world patients. Specifically, we discuss the choices of data sources and variables to meet key theoretical assumptions and principles. We introduce and compare estimation methods for continuous, binary, and survival endpoints.
We showcase the use of the R package genRCT through a case study that estimates the average treatment effect of adjuvant chemotherapy for the stage 1B non-small cell lung patients represented by a large cancer registry.},
  journal = {JOURNAL OF BIOPHARMACEUTICAL STATISTICS},
  author = {Lee, Dasom and Yang, Shu and Berry, Mark and Stinchcombe, Tom and Cohen, Harvey Jay and Wang, Xiaofei},
  year = {2024},
  month = apr
}

@article{yang_gao_zeng_wang_2023,
  title = {Elastic integrative analysis of randomised trial and real-world data for treatment heterogeneity estimation},
  volume = {4},
  issn = {1467-9868},
  url = {https://doi.org/10.1093/jrsssb/qkad017},
  doi = {10.1093/jrsssb/qkad017},
  journal = {JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES B-STATISTICAL METHODOLOGY},
  author = {Yang, Shu and Gao, Chenyin and Zeng, Donglin and Wang, Xiaofei},
  year = {2023},
  month = apr
}

@article{gao_yang_2023,
  title = {Pretest estimation in combining probability and non-probability samples},
  volume = {17},
  issn = {1935-7524},
  doi = {10.1214/23-EJS2137},
  abstractNote = {Multiple heterogeneous data sources are becoming increasingly available for statistical analyses in the era of big data. As an important example in finite-population inference, we develop a unified framework of the test-and-pool approach to general parameter estimation by combining gold-standard probability and non-probability samples. We focus on the case when the study variable is observed in both datasets for estimating the target parameters, and each contains other auxiliary variables. Utilizing the probability design, we conduct a pretest procedure to determine the comparability of the non-probability data with the probability data and decide whether or not to leverage the non-probability data in a pooled analysis. When the probability and non-probability data are comparable, our approach combines both data for efficient estimation. Otherwise, we retain only the probability data for estimation.
We also characterize the asymptotic distribution of the proposed test-and-pool estimator under a local alternative and provide a data-adaptive procedure to select the critical tuning parameters that target the smallest mean square error of the test-and-pool estimator. Lastly, to deal with the non-regularity of the test-and-pool estimator, we construct a robust confidence interval that has a good finite-sample coverage property.},
  number = {1},
  journal = {ELECTRONIC JOURNAL OF STATISTICS},
  author = {Gao, Chenyin and Yang, Shu},
  year = {2023},
  pages = {1492--1546}
}

@article{liu_zhang_golm_liu_yang_2023,
  title = {Robust analyzes for longitudinal clinical trials with missing and non-normal continuous outcomes},
  volume = {9},
  issn = {2475-4277},
  doi = {10.1080/24754269.2023.2261351},
  abstractNote = {Missing data is unavoidable in longitudinal clinical trials, and outcomes are not always normally distributed. In the presence of outliers or heavy-tailed distributions, the conventional multiple imputation with the mixed model with repeated measures analysis of the average treatment effect (ATE) based on the multivariate normal assumption may produce bias and power loss. Control-based imputation (CBI) is an approach for evaluating the treatment effect under the assumption that participants in both the test and control groups with missing outcome data have a similar outcome profile as those with an identical history in the control group. We develop a robust framework to handle non-normal outcomes under CBI without imposing any parametric modeling assumptions. Under the proposed framework, sequential weighted robust regressions are applied to protect the constructed imputation model against non-normality in the covariates and the response variables. Accompanied by the subsequent mean imputation and robust model analysis, the resulting ATE estimator has good theoretical properties in terms of consistency and asymptotic normality.
Moreover, our proposed method guarantees the analysis model robustness of the ATE estimation in the sense that its asymptotic results remain intact even when the analysis model is misspecified. The superiority of the proposed robust method is demonstrated by comprehensive simulation studies and an AIDS clinical trial data application.},
  journal = {STATISTICAL THEORY AND RELATED FIELDS},
  author = {Liu, Siyi and Zhang, Yilong and Golm, Gregory T. and Liu, Guanghan and Yang, Shu},
  year = {2023},
  month = sep
}

@article{gao_yang_kim_2023,
  title = {Soft calibration for selection bias problems under mixed-effects models},
  volume = {3},
  issn = {1464-3510},
  url = {https://doi.org/10.1093/biomet/asad016},
  doi = {10.1093/biomet/asad016},
  journal = {BIOMETRIKA},
  author = {Gao, Chenyin and Yang, Shu and Kim, Jae Kwang},
  year = {2023},
  month = mar
}

@article{chu_lu_yang_2023,
  title = {Targeted optimal treatment regime learning using summary statistics},
  volume = {3},
  issn = {1464-3510},
  doi = {10.1093/biomet/asad020},
  journal = {BIOMETRIKA},
  author = {Chu, J. and Lu, W. and Yang, S.},
  year = {2023},
  month = mar
}

@article{larsen_yang_reich_rappold_2022,
  title = {A Spatial Causal Analysis of Wildland Fire-Contributed {PM2.5} Using Numerical Model Output},
  volume = {16},
  issn = {1941-7330},
  doi = {10.1214/22-AOAS1610},
  abstractNote = {Wildland fire smoke contains hazardous levels of fine particulate matter (PM2.5), a pollutant shown to adversely affect health. Estimating fire attributable PM2.5 concentrations is key to quantifying the impact on air quality and subsequent health burden. This is a challenging problem since only total PM2.5 is measured at monitoring stations and both fire-attributable PM2.5 and PM2.5 from all other sources are correlated in space and time.
We propose a framework for estimating fire-contributed PM2.5 and PM2.5 from all other sources using a novel causal inference framework and bias-adjusted chemical model representations of PM2.5 under counterfactual scenarios. The chemical model representation of PM2.5 for this analysis is simulated using Community Multiscale Air Quality Modeling System (CMAQ), run with and without fire emissions across the contiguous U.S. for the 2008-2012 wildfire seasons. The CMAQ output is calibrated with observations from monitoring sites for the same spatial domain and time period. We use a Bayesian model that accounts for spatial variation to estimate the effect of wildland fires on PM2.5 and state assumptions under which the estimate has a valid causal interpretation. Our results include estimates of the contributions of wildfire smoke to PM2.5 for the contiguous U.S. Additionally, we compute the health burden associated with the PM2.5 attributable to wildfire smoke.},
  number = {4},
  journal = {ANNALS OF APPLIED STATISTICS},
  author = {Larsen, Alexandra and Yang, Shu and Reich, Brian J. and Rappold, Ana G.},
  year = {2022},
  month = dec,
  pages = {2714--2731}
}

@article{yu_lu_yang_ghosh_2022,
  title = {A multiplicative structural nested mean model for zero-inflated outcomes},
  volume = {8},
  issn = {1464-3510},
  doi = {10.1093/biomet/asac050},
  journal = {BIOMETRIKA},
  author = {Yu, Miao and Lu, Wenbin and Yang, Shu and Ghosh, Pulak},
  year = {2022},
  month = aug
}

@article{reich_yang_guan_2022,
  title = {Discussion on ``{Spatial+}: A novel approach to spatial confounding'' by {Dupont}, {Wood}, and {Augustin}},
  volume = {3},
  issn = {1541-0420},
  doi = {10.1111/biom.13651},
  abstractNote = {Congratulations to the authors for this thoughtful and timely contribution to the spatial confounding literature.
The intuitive nature of the method and simplicity of the estimation procedure will surely make Spatial+ popular with practitioners, and the theoretical developments are a major advance for researchers in this area. There is much to discuss! We have formatted our discussion in two sections: in Section 2 we consider the assumptions and statistical properties of Spatial+, and in Section 3 we examine how Spatial+ fits in the wider literature on spatial causal inference.},
  journal = {BIOMETRICS},
  author = {Reich, Brian J. and Yang, Shu and Guan, Yawen},
  year = {2022},
  month = mar
}

@article{zhao_zhang_yang_2022,
  title = {Double score matching in observational studies with multi-level treatments},
  volume = {8},
  issn = {1532-4141},
  doi = {10.1080/03610918.2022.2118778},
  abstractNote = {While weighting methods are popular for comparing the effects of multi-level treatment in observational studies, their performance can be unstable in the presence of extreme values of the generalized propensity score (GPS). Matching methods are more resistant to GPS outliers but bear the risk of GPS model misspecification. In this article, we propose a double score matching (DSM) estimator of the pairwise average treatment effect (ATE) based on the GPS and the generalized prognostic score (GPGS) evaluated at one treatment level at a time.
The de-biased DSM estimator not only maintains the advantage of matching methods but also alleviates the model dependence problem due to its double robustness: it consistently estimates the true pairwise ATE if either the GPS or the GPGS is correctly specified.},
  journal = {COMMUNICATIONS IN STATISTICS-SIMULATION AND COMPUTATION},
  author = {Zhao, Honghe and Zhang, Xiaofei and Yang, Shu},
  year = {2022},
  month = aug
}

@article{lee_yang_wang_2022,
  title = {Doubly robust estimators for generalizing treatment effects on survival outcomes from randomized controlled trials to a target population},
  volume = {10},
  issn = {2193-3685},
  doi = {10.1515/jci-2022-0004},
  number = {1},
  journal = {JOURNAL OF CAUSAL INFERENCE},
  author = {Lee, Dasom and Yang, Shu and Wang, Xiaofei},
  year = {2022},
  month = dec,
  pages = {415--440}
}

@article{giffin_gong_majumder_rappold_reich_yang_2022,
  title = {Estimating intervention effects on infectious disease control: The effect of community mobility reduction on {Coronavirus} spread},
  volume = {52},
  issn = {2211-6753},
  doi = {10.1016/j.spasta.2022.100711},
  abstractNote = {Understanding the effects of interventions, such as restrictions on community and large group gatherings, is critical to controlling the spread of COVID-19. Susceptible-Infectious-Recovered (SIR) models are traditionally used to forecast the infection rates but do not provide insights into the causal effects of interventions. We propose a spatiotemporal model that estimates the causal effect of changes in community mobility (intervention) on infection rates. Using an approximation to the SIR model and incorporating spatiotemporal dependence, the proposed model estimates a direct and indirect (spillover) effect of intervention. Under an interference and treatment ignorability assumption, this model is able to estimate causal intervention effects, and additionally allows for spatial interference between locations.
Reductions in community mobility were measured by cell phone movement data. The results suggest that the reductions in mobility decrease Coronavirus cases 4 to 7 weeks after the intervention.},
  journal = {SPATIAL STATISTICS},
  author = {Giffin, Andrew and Gong, Wenlong and Majumder, Suman and Rappold, Ana G. and Reich, Brian J. and Yang, Shu},
  year = {2022},
  month = dec
}

@article{wang_wong_yang_chan_2022,
  title = {Estimation of partially conditional average treatment effect by double kernel-covariate balancing},
  volume = {16},
  issn = {1935-7524},
  doi = {10.1214/22-EJS2000},
  abstractNote = {We study nonparametric estimation for the partially conditional average treatment effect, defined as the treatment effect function over an interested subset of confounders. We propose a double kernel weighting estimator where the weights aim to control the balancing error of any function of the confounders from a reproducing kernel Hilbert space after kernel smoothing over the interested subset of variables. In addition, we present an augmented version of our estimator which can incorporate the estimation of outcome mean functions. Based on the representer theorem, gradient-based algorithms can be applied for solving the corresponding infinite-dimensional optimization problem. Asymptotic properties are studied without any smoothness assumptions for the propensity score function or the need for data splitting, relaxing certain existing stringent assumptions. The numerical performance of the proposed estimator is demonstrated by a simulation study and an application to the effect of a mother’s smoking on a baby’s birth weight conditioned on the mother’s age.},
  number = {2},
  journal = {ELECTRONIC JOURNAL OF STATISTICS},
  author = {Wang, Jiayi and Wong, Raymond K. W.
and Yang, Shu and Chan, Kwun Chuen Gary},
  year = {2022},
  pages = {4332--4378}
}

@article{giffin_reich_yang_rappold_2022,
  title = {Generalized propensity score approach to causal inference with spatial interference},
  volume = {9},
  issn = {1541-0420},
  doi = {10.1111/biom.13745},
  journal = {BIOMETRICS},
  author = {Giffin, Andrew and Reich, Brian J. and Yang, Shu and Rappold, Ana G.},
  year = {2022},
  month = sep
}

@article{kong_yang_wang_2022,
  title = {Identifiability of causal effects with multiple causes and a binary outcome},
  volume = {109},
  issn = {1464-3510},
  doi = {10.1093/biomet/asab016},
  number = {1},
  journal = {BIOMETRIKA},
  author = {Kong, Dehan and Yang, Shu and Wang, Linbo},
  year = {2022},
  month = feb,
  pages = {265--272}
}

@article{lee_yang_dong_wang_zeng_cai_2022,
  title = {Improving trial generalizability using observational studies},
  volume = {1},
  issn = {1541-0420},
  doi = {10.1111/biom.13609},
  journal = {BIOMETRICS},
  author = {Lee, Dasom and Yang, Shu and Dong, Lin and Wang, Xiaofei and Zeng, Donglin and Cai, Jianwen},
  year = {2022},
  month = jan
}

@article{mao_wang_yang_2022,
  title = {Matrix completion under complex survey sampling},
  volume = {9},
  issn = {1572-9052},
  doi = {10.1007/s10463-022-00851-5},
  journal = {ANNALS OF THE INSTITUTE OF STATISTICAL MATHEMATICS},
  author = {Mao, Xiaojun and Wang, Zhonglei and Yang, Shu},
  year = {2022},
  month = sep
}

@article{jiang_yang_ding_2022,
  title = {Multiply robust estimation of causal effects under principal ignorability},
  volume = {5},
  issn = {1467-9868},
  doi = {10.1111/rssb.12538},
  journal = {JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES B-STATISTICAL METHODOLOGY},
  author = {Jiang, Zhichao and Yang, Shu and Ding, Peng},
  year = {2022},
  month = may
}

@article{yang_zhang_2022,
  title = {Multiply robust matching estimators of average and quantile treatment effects},
  volume = {4},
  issn = {1467-9469},
  doi = {10.1111/sjos.12585},
  journal = {SCANDINAVIAN JOURNAL OF STATISTICS},
  author = {Yang, Shu and Zhang, Yunshu},
  year = {2022},
  month = apr
}

@article{gao_thompson_kim_yang_2022,
  title = {Nearest neighbour ratio imputation with incomplete multinomial outcome in survey sampling},
  volume = {5},
  issn = {1467-985X},
  url = {https://doi.org/10.1111/rssa.12841},
  doi = {10.1111/rssa.12841},
  journal = {JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES A-STATISTICS IN SOCIETY},
  author = {Gao, Chenyin and Thompson, Katherine Jenny and Kim, Jae Kwang and Yang, Shu},
  year = {2022},
  month = may
}

@article{chen_yang_kim_2022,
  title = {Nonparametric Mass Imputation for Data Integration},
  volume = {10},
  issn = {2325-0992},
  doi = {10.1093/jssam/smaa036},
  number = {1},
  journal = {JOURNAL OF SURVEY STATISTICS AND METHODOLOGY},
  author = {Chen, Sixia and Yang, Shu and Kim, Jae Kwang},
  year = {2022},
  month = jan,
  pages = {1--24}
}

@article{zhao_yang_2022,
  title = {Outcome-adjusted balance measure for generalized propensity score model selection},
  volume = {221},
  issn = {1873-1171},
  doi = {10.1016/j.jspi.2022.04.004},
  abstractNote = {In this article, we propose the outcome-adjusted balance measure to perform model selection for the generalized propensity score (GPS), which serves as an essential component in estimation of the pairwise average treatment effects (ATEs) in observational studies with more than two treatment levels. The primary goal of the balance measure is to identify the GPS model specification such that the resulting ATE estimator is consistent and efficient. Following recent empirical and theoretical evidence, we establish that the optimal GPS model should only include covariates related to the outcomes. Given a collection of candidate GPS models, the outcome-adjusted balance measure imputes all baseline covariates by matching on each candidate model, and selects the model that minimizes a weighted sum of absolute mean differences between the imputed and original values of the covariates.
The weights are defined to leverage the covariate–outcome relationship, so that GPS models without optimal variable selection are penalized. Under appropriate assumptions, we show that the outcome-adjusted balance measure consistently selects the optimal GPS model, so that the resulting GPS matching estimator is asymptotically normal and efficient. We compare its finite sample performance with existing measures in a simulation study. We apply the proposed method to two real data applications.},
  journal = {JOURNAL OF STATISTICAL PLANNING AND INFERENCE},
  author = {Zhao, Honghe and Yang, Shu},
  year = {2022},
  month = dec,
  pages = {188--200}
}

@article{huang_yang_2022,
  title = {Robust Inference of Conditional Average Treatment Effects Using Dimension Reduction},
  volume = {32},
  issn = {1996-8507},
  doi = {10.5705/ss.202020.0409},
  abstractNote = {Personalized treatment aims at tailoring treatments to individual characteristics. An important step is to understand how a treatment effect varies across individual characteristics, known as the conditional average treatment effect (CATE). In this study, we make robust inferences of the CATE from observational data, which becomes challenging with a multivariate confounder. To reduce the curse of dimensionality, while keeping the nonparametric advantages, we propose double dimension reductions that achieve different goals. First, we identify the central mean subspace of the CATE directly using dimension reduction in order to detect the most accurate and parsimonious structure of the CATE. Second, we use a nonparametric regression with a prior dimension reduction to impute counterfactual outcomes, which helps to improve the stability of the imputation. We establish the asymptotic properties of the proposed estimator, taking into account the two-step double dimension reduction, and propose an effective bootstrapping procedure without bootstrapping the estimated central mean subspace to make valid inferences.
A simulation and applications show that the proposed estimator outperforms existing competitors.},
  journal = {STATISTICA SINICA},
  author = {Huang, Ming-Yueh and Yang, Shu},
  year = {2022},
  pages = {547--567}
}

@article{yang_2022,
  title = {Semiparametric estimation of structural nested mean models with irregularly spaced longitudinal observations},
  volume = {78},
  issn = {1541-0420},
  doi = {10.1111/biom.13471},
  number = {3},
  journal = {BIOMETRICS},
  author = {Yang, Shu},
  year = {2022},
  month = sep,
  pages = {937--949}
}

@article{liu_yang_zhang_liu_2022,
  title = {Sensitivity analyses in longitudinal clinical trials via distributional imputation},
  volume = {11},
  issn = {1477-0334},
  doi = {10.1177/09622802221135251},
  abstractNote = {Missing data is inevitable in longitudinal clinical trials. Conventionally, the missing at random assumption is assumed to handle missingness, which however is unverifiable empirically. Thus, sensitivity analyses are critically important to assess the robustness of the study conclusions against untestable assumptions. Toward this end, regulatory agencies and the pharmaceutical industry use sensitivity models such as return-to-baseline, control-based, and washout imputation, following the ICH E9(R1) guidance. Multiple imputation is popular in sensitivity analyses; however, it may be inefficient and result in an unsatisfying interval estimation by Rubin’s combining rule. We propose distributional imputation in sensitivity analysis, which imputes each missing value by samples from its target imputation model given the observed data. Drawn on the idea of Monte Carlo integration, the distributional imputation estimator solves the mean estimating equations of the imputed dataset. It is fully efficient with theoretical guarantees. Moreover, we propose weighted bootstrap to obtain a consistent variance estimator, taking into account the variabilities due to model parameter estimation and target parameter estimation.
The superiority of the distributional imputation framework is validated in the simulation study and an antidepressant longitudinal clinical trial.},
  journal = {STATISTICAL METHODS IN MEDICAL RESEARCH},
  author = {Liu, Siyi and Yang, Shu and Zhang, Yilong and Liu, Guanghan},
  year = {2022},
  month = nov
}

@article{guan_page_reich_ventrucci_yang_2022,
  title = {Spectral adjustment for spatial confounding},
  volume = {12},
  issn = {1464-3510},
  doi = {10.1093/biomet/asac069},
  journal = {BIOMETRIKA},
  author = {Guan, Yawen and Page, Garritt L. and Reich, Brian J. and Ventrucci, Massimo and Yang, Shu},
  year = {2022},
  month = dec
}

@article{yang_kim_2022,
  title = {Statistical data integration in survey sampling: a review (vol 3, pg 625, 2020)},
  volume = {3},
  issn = {2520-8764},
  doi = {10.1007/s42081-022-00152-4},
  journal = {JAPANESE JOURNAL OF STATISTICS AND DATA SCIENCE},
  author = {Yang, Shu and Kim, Jae Kwang},
  year = {2022},
  month = mar
}

@article{wu_yang_2022,
  title = {Transfer Learning of Individualized Treatment Rules from Experimental to Real-World Data},
  volume = {11},
  issn = {1537-2715},
  doi = {10.1080/10618600.2022.2141752},
  abstractNote = {Individualized treatment effect lies at the heart of precision medicine. Interpretable individualized treatment rules (ITRs) are desirable for clinicians or policymakers due to their intuitive appeal and transparency. The gold-standard approach to estimating the ITRs is randomized experiments, where subjects are randomized to different treatment groups and the confounding bias is minimized to the extent possible. However, experimental studies are limited in external validity because of their selection restrictions, and therefore the underlying study population is not representative of the target real-world population. Conventional learning methods of optimal interpretable ITRs for a target population based only on experimental data are biased.
On the other hand, real-world data (RWD) are becoming popular and provide a representative sample of the target population. To learn the generalizable optimal interpretable ITRs, we propose an integrative transfer learning method based on weighting schemes to calibrate the covariate distribution of the experiment to that of the RWD. Theoretically, we establish the risk consistency for the proposed ITR estimator. Empirically, we evaluate the finite-sample performance of the transfer learner through simulations and apply it to a real data application of a job training program. Supplementary materials for this article are available online.},
  journal = {Journal of Computational and Graphical Statistics},
  author = {Wu, Lili and Yang, Shu},
  year = {2022},
  month = nov
}

@article{johnson_pieper_yang_2022,
  title = {Treatment-specific marginal structural {Cox} model for the effect of treatment discontinuation},
  author = {Johnson, Dana and Pieper, Karen and Yang, Shu},
  journal = {Pharmaceutical Statistics},
  year = {2022},
  month = mar,
  issn = {1539-1612},
  doi = {10.1002/pst.2211}
}

@article{corder_yang_2022,
  title = {Utilizing stratified generalized propensity score matching to approximate blocked randomized designs with multiple treatment levels},
  issn = {1520-5711},
  doi = {10.1080/10543406.2022.2065507},
  abstractNote = {Conducting causal inference in settings with more than one treatment level can be challenging. Classical methods, such as propensity score matching (PSM), are restricted to only a binary treatment. To extend propensity score methods beyond a binary treatment, generalized propensity score methods have been proposed, with generalized propensity score matching (GPSM) standing as the multi-level treatment analog to PSM. One drawback of GPSM is it is only capable of emulating a completely randomized trial (CRT) design and not the more efficient blocked randomized trial design.
Motivated by the desire to emulate the more efficient design, we expand on GPSM estimating literature and develop a new estimator incorporating relevant stratifying variables into the GPSM framework. We examine the variance estimation methods available for GPSM and demonstrate how to extend the estimator to one where stratifying variables are included. While it would be straightforward to include relevant stratifying variables as covariates in the propensity score estimation, our method provides for researchers to conduct retrospective analyses more consistently with the prospective experiment they would have designed if permitted. Namely, our method permits researchers to approximate a stratified randomized trial as opposed to the CRT otherwise obtainable by GPSM. We apply our proposed method to an analysis of how the number of children in a household affects systolic blood pressure in adults. We conduct a simulation study assessing how the relationship between response, treatment, and strata affect the performance of our method and compare the results to non-stratified GPSM.},
  journal = {Journal of Biopharmaceutical Statistics},
  author = {Corder, Nathan and Yang, Shu},
  year = {2022},
  month = jun
}

@article{reich_yang_guan_giffin_miller_rappold_2021,
  title = {A Review of Spatial Causal Inference Methods for Environmental and Epidemiological Applications},
  doi = {10.1111/insr.12452},
  journal = {International Statistical Review},
  author = {Reich, Brian and Yang, Shu and Guan, Yawen and Giffin, Andrew B. and Miller, Matthew J.
and Rappold, Ana},
  year = {2021}
}

@article{zhang_yang_ye_faries_lipkovich_kadziola_2021,
  title = {Practical recommendations on double score matching for estimating causal effects},
  issn = {1097-0258},
  doi = {10.1002/sim.9289},
  abstractNote = {Unlike in randomized clinical trials (RCTs), confounding control is critical for estimating the causal effects from observational studies due to the lack of treatment randomization. Under the unconfoundedness assumption, matching methods are popular because they can be used to emulate an RCT that is hidden in the observational study. To ensure the key assumption hold, the effort is often made to collect a large number of possible confounders, rendering dimension reduction imperative in matching. Three matching schemes based on the propensity score (PSM), prognostic score (PGM), and double score (DSM, ie, the collection of the first two scores) have been proposed in the literature. However, a comprehensive comparison is lacking among the three matching schemes and has not made inroads into the best practices including variable selection, choice of caliper, and replacement. In this article, we explore the statistical and numerical properties of PSM, PGM, and DSM via extensive simulations. Our study supports that DSM performs favorably with, if not better than, the two single score matching in terms of bias and variance. In particular, DSM is doubly robust in the sense that the matching estimator is consistent requiring either the propensity score model or the prognostic score model is correctly specified. Variable selection on the propensity score model and matching with replacement is suggested for DSM, and we illustrate the recommendations with comprehensive simulation studies. An R package is available at https://github.com/Yunshu7/dsmatch.},
  journal = {Statistics in Medicine},
  author = {Zhang, Yunshu and Yang, Shu and Ye, Wenyu and Faries, Douglas E.
and Lipkovich, Ilya and Kadziola, Zbigniew},
  year = {2021},
  month = dec
}

@article{cools_johnson_camm_bassand_verheugt_yang_tsiatis_fitzmaurice_goldhaber_kayani_et_al._2021,
  title = {Risks associated with discontinuation of oral anticoagulation in newly diagnosed patients with atrial fibrillation: Results from the {GARFIELD-AF} Registry},
  author = {Cools, Frank and Johnson, Dana and Camm, Alan J. and Bassand, Jean-Pierre and Verheugt, Freek W. A. and Yang, Shu and Tsiatis, Anastasios and Fitzmaurice, David A. and Goldhaber, Samuel Z. and Kayani, Gloria and others},
  journal = {Journal of Thrombosis and Haemostasis},
  year = {2021},
  month = jul,
  issn = {1538-7836},
  doi = {10.1111/jth.15415},
  abstractNote = {Oral anticoagulation (OAC) in atrial fibrillation (AF) reduces the risk of stroke/systemic embolism (SE). The impact of OAC discontinuation is less well documented.}
}

@article{yang_zhang_liu_guan_2021,
  title = {{SMIM}: A unified framework of survival sensitivity analysis using multiple imputation and martingale},
  author = {Yang, Shu and Zhang, Yilong and Liu, Guanghan Frank and Guan, Qian},
  journal = {Biometrics},
  year = {2021},
  month = sep,
  issn = {1541-0420},
  doi = {10.1111/biom.13555}
}

@article{dong_laber_goldberg_song_yang_2020,
  title = {Ascertaining properties of weighting in the estimation of optimal treatment regimes under monotone missingness},
  volume = {39},
  issn = {1097-0258},
  doi = {10.1002/sim.8678},
  abstractNote = {Dynamic treatment regimes operationalize precision medicine as a sequence of decision rules, one per stage of clinical intervention, that map up‐to‐date patient information to a recommended intervention. An optimal treatment regime maximizes the mean utility when applied to the population of interest. Methods for estimating an optimal treatment regime assume the data to be fully observed, which rarely occurs in practice.
A common approach is to first use multiple imputation and then pool the estimators across imputed datasets. However, this approach requires estimating the joint distribution of patient trajectories, which can be high‐dimensional, especially when there are multiple stages of intervention. We examine the application of inverse probability weighted estimating equations as an alternative to multiple imputation in the context of monotonic missingness. This approach applies to a broad class of estimators of an optimal treatment regime including both Q‐learning and a generalization of outcome weighted learning. We establish consistency under mild regularity conditions and demonstrate its advantages in finite samples using a series of simulation experiments and an application to a schizophrenia study.},
  number = {25},
  journal = {Statistics in Medicine},
  author = {Dong, Lin and Laber, Eric and Goldberg, Yair and Song, Rui and Yang, Shu},
  year = {2020},
  month = nov,
  pages = {3503--3520}
}

@misc{yang_zhang_2020,
  title = {Double score matching estimators of average and quantile treatment effects},
  eprint = {2001.06049},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2001.06049},
  author = {Yang, S.
and Zhang, Y.},
  year = {2020}
}

@article{yang_kim_song_2020,
  title = {Doubly robust inference when combining probability and non-probability samples with high dimensional data},
  author = {Yang, Shu and Kim, Jae Kwang and Song, Rui},
  journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  publisher = {Wiley},
  year = {2020},
  month = jan,
  issn = {1369-7412},
  url = {http://dx.doi.org/10.1111/rssb.12354},
  doi = {10.1111/rssb.12354}
}

@article{corder_yang_2020,
  title = {Estimating Average Treatment Effects Utilizing Fractional Imputation when Confounders are Subject to Missingness},
  author = {Corder, Nathan and Yang, Shu},
  journal = {Journal of Causal Inference},
  volume = {8},
  number = {1},
  pages = {249--271},
  year = {2020},
  month = jan,
  issn = {2193-3685},
  doi = {10.1515/jci-2019-0024}
}

@misc{corder_yang_2020_arxiv,
  title = {Estimating Average Treatment Effects Utilizing Fractional Imputation when Confounders are Subject to Missingness},
  author = {Corder, N. and Yang, S.},
  year = {2020},
  eprint = {1905.11497},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/pdf/1905.11497}
}

@misc{dong_yang_wang_zeng_cai_2020,
  title = {Integrative analysis of randomized clinical trials with real world evidence studies},
  author = {Dong, L. and Yang, S. and Wang, X. and Zeng, D. and Cai, J. W.},
  year = {2020},
  eprint = {2003.01242},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/pdf/2003.01242}
}

@article{li_yang_han_2020,
  title = {Robust estimation for moment condition models with data missing not at random},
  volume = {207},
  issn = {1873-1171},
  doi = {10.1016/j.jspi.2020.01.001},
  abstractNote = {We consider estimation for parameters defined through moment conditions when data are missing not at random. The missingness mechanism cannot be determined from the data alone, and inference under missingness not at random may be sensitive to unverifiable assumptions about the missingness mechanism.
To add protection against model misspecification, we posit multiple models for the response probability and propose a weighting estimator with calibrated weights. Assuming the conditional distribution of the outcome given covariates is correctly modeled, we show that if any one of the multiple models for the response probability is correctly specified, the proposed estimator is consistent for the true value. A simulation study confirms that our estimator has multiple robustness when the outcome data is missing not at random. The method is also applied to an application.},
  journal = {Journal of Statistical Planning and Inference},
  author = {Li, W. and Yang, S. and Han, P.},
  year = {2020},
  month = jul,
  pages = {246--254}
}

@article{yang_kim_2020,
  title = {Statistical data integration in survey sampling: a review},
  volume = {3},
  issn = {2520-8764},
  doi = {10.1007/s42081-020-00093-w},
  abstractNote = {Finite population inference is a central goal in survey sampling. Probability sampling is the main statistical approach to finite population inference. Challenges arise due to high cost and increasing non-response rates. Data integration provides a timely solution by leveraging multiple data sources to provide more robust and efficient inference than using any single data source alone. The technique for data integration varies depending on types of samples and available information to be combined. This article provides a systematic review of data integration techniques for combining probability samples, probability and non-probability samples, and probability and big data samples. We discuss a wide range of integration methods such as generalized least squares, calibration weighting, inverse probability weighting, mass imputation, and doubly robust methods.
Finally, we highlight important questions for future research.},
  number = {2},
  journal = {Japanese Journal of Statistics and Data Science},
  author = {Yang, Shu and Kim, Jae Kwang},
  year = {2020},
  month = dec,
  pages = {625--650}
}

@misc{yang_kim_2020_arxiv,
  title = {Statistical data integration in survey sampling: a review},
  author = {Yang, S. and Kim, J. K.},
  year = {2020},
  month = jan,
  eprint = {2001.03259},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2001.03259}
}

@article{yang_kim_2019,
  title = {Asymptotic theory and inference of predictive mean matching imputation using a superpopulation model framework},
  author = {Yang, Shu and Kim, Jae Kwang},
  journal = {Scandinavian Journal of Statistics},
  publisher = {Wiley},
  volume = {47},
  number = {3},
  pages = {839--861},
  year = {2019},
  month = dec,
  issn = {0303-6898 1467-9469},
  url = {http://dx.doi.org/10.1111/sjos.12429},
  doi = {10.1111/sjos.12429}
}

@article{yang_wang_ding_2019,
  title = {Causal inference with confounders missing not at random},
  author = {Yang, S and Wang, L and Ding, P},
  journal = {Biometrika},
  publisher = {Oxford University Press (OUP)},
  volume = {106},
  number = {4},
  pages = {875--888},
  year = {2019},
  month = sep,
  issn = {0006-3444 1464-3510},
  url = {http://dx.doi.org/10.1093/biomet/asz048},
  doi = {10.1093/biomet/asz048}
}

@article{yang_ding_2019,
  title = {Combining Multiple Observational Data Sources to Estimate Causal Effects},
  issn = {0162-1459 1537-274X},
  url = {http://dx.doi.org/10.1080/01621459.2019.1609973},
  doi = {10.1080/01621459.2019.1609973},
  abstractNote = {The era of big data has witnessed an increasing availability of multiple data sources for statistical analyses. We consider estimation of causal effects combining big main data with unmeasured confounders and smaller validation data with supplementary information on these confounders.
Under the unconfoundedness assumption with completely observed confounders, the smaller validation data allow for constructing consistent estimators for causal effects, but the big main data can only give error-prone estimators in general. However, by leveraging the information in the big main data in a principled way, we can improve the estimation efficiencies yet preserve the consistencies of the initial estimators based solely on the validation data. Our framework applies to asymptotically normal estimators, including the commonly used regression imputation, weighting, and matching estimators, and does not require a correct specification of the model relating the unmeasured confounders to the observed variables. We also propose appropriate bootstrap procedures, which makes our method straightforward to implement using software routines for existing estimators. Supplementary materials for this article are available online.},
  journal = {Journal of the American Statistical Association},
  publisher = {Informa UK Limited},
  author = {Yang, Shu and Ding, Peng},
  year = {2019},
  month = jun,
  pages = {1--33}
}

@article{yang_zeng_2019,
  title = {Discussion of ``Penalized Spline of Propensity Methods for Treatment Comparison'' by {Zhou}, {Elliott}, and {Little}},
  volume = {114},
  issn = {1537-274X},
  doi = {10.1080/01621459.2018.1537916},
  abstractNote = {We congratulate the authors on presenting this stimulating article. The article proposes a novel estimation of the average causal effects of the treatment strategies in the presence of timedependent confounding by indication. Causal inference under the potential outcomes framework can be viewed from a missing data perspective. The article extends the penalized spline of propensity methods in handling missing data to causal inference incorporating a temporal element into consideration.
Under the sequential randomization assumption, Robins and his colleagues have proposed different approaches to handle time-varying confounding, such as g-computation (Robins and Greenland 1992) and marginal structural models (MSMs, Robins 2000) using (augmented) inverse probability weighting (A/IPW). However, g-computation is fully parametric and therefore is sensitive to model assumptions. Weighting requires the positivity assumption that the probability of receiving each treatment is strictly positive for all subjects. In many practices, the weights for some subjects can become extremely large, leading to both bias and large variability. In contrast with the predominantly propensity score weighting approaches, the authors have used the propensity score as the predictors in the outcome mean model in addition to the individual covariates. In particular, the authors adopt penalized splines for propensity scores to provide a flexible model for imputing time-varying potential outcomes; thus, the estimated treatment effects are likely immune to the misspecification of the propensity score when the imputation model is correctly specified. The proposed estimation method successfully avoids the drawbacks of the existing competitors: (i) it does not involve weighting by the inverse of the propensity score and therefore avoids the possibly large variability due to weighting, and (ii) it improves the robustness to model misspecification. Structural nested models: We would like to bring to the authors’ attention another class of structural models that has been proposed in the literature for a while to estimate the treatment effect in longitudinal observational studies, namely the structural nested models (SNMs). SNMs allow modeling time-varying treatment modification effects using the post baseline time-dependent covariates. 
For example, the structural nested mean model for a continuous outcome specifies the effect of treatments through γzt (Lt ; ψ0) = E(Yzt t − Y0 t | Lt ; ψ0), where Lt = (Xt , Zt−1). The g-estimation calculates},
  number = {525},
  journal = {Journal of the American Statistical Association},
  author = {Yang, Shu and Zeng, Donglin},
  year = {2019},
  month = jan,
  pages = {31--32}
}

@article{yang_2019,
  title = {Flexible Imputation of Missing Data, 2nd ed.},
  volume = {114},
  issn = {0162-1459 1537-274X},
  url = {http://dx.doi.org/10.1080/01621459.2019.1662249},
  doi = {10.1080/01621459.2019.1662249},
  abstractNote = {Missing data are frequently encountered in practice. A broader class of missing data is called incomplete data, which includes data with measurement error, multilevel data with latent variables, and potential outcomes in causal inference. Due to its intuitive appeal, multiple imputation has been the most popular method for handling incomplete data: it multiply fills in the missing values and pools analyses based on straightforward rules. The book focuses mainly on the multiple imputation approach to incomplete data but not on alternative approaches, such as weighting procedures and likelihood-based approaches. The main objective of the book is to provide a tool kit for practitioners to execute multiple imputation. The first edition of this book (van Buuren 2012) has been popular and wellreceived in the statistics and applied research communities. The main text explains the basic and key ideas underpinning multiple imputation, how to implement it in practice, and how to report and interpret the results. Moreover, it uses a readerfriendly style with lots of worked-out examples for illustration based on the MICE package. Therefore, the book can also be viewed as an extended MICE tutorial. Drawing from the author’s own work and from the most recent developments in the field, the new edition expands discussions on important topics or incorporates new topics.
This edition features new chapters on multiple imputation of multilevel data, causal inference by multiple imputation of the potential outcomes, and new dataadaptive algorithms for imputation such as predictive mean matching, trees, and many other machine learning tools. The aspect I like most about this book is that it is well-balanced between practicality and technicality. On the one hand, it avoids too much mathematical and technical details and uses graphical tools and visual displays to aid understanding. Practitioners are able to understand the big picture and follow the code provided in the book. On the other hand, theoreticians are able to find technical materials and references to journal articles for in-depth investigation. Divided into three parts, this book begins by an introduction of useful background material and an overview. Chapter 2 reviews the history of multiple imputation and introduces the general taxonomy of missing data mechanisms to set the stage for the whole book. Chapters 3 and 4 cover imputation methods for univariate missing data and multivariate missing data, respectively. Chapter 5 reviews post-imputation statistical analyses of the multiply imputed data. Dedicated statistical tests and model selection techniques are also included. The second part covers various advances of multiple imputation. Chapter 6 covers practical issues such as model specification and diagnosis for multivariate missing data. Chapter 7 is a new chapter discussing multiple imputation for nested data accounting for the multilevel data structure. Chapter 8 is also a new chapter, discussing multiple imputation for causal inference of individualized treatment effects. Chapters 9–11 use case studies to illustrate applications of multiple imputation with item non-response, measurement error, and drop out. The last part discusses limitations, reporting steps, and various extensions. 
Multiple imputation was originated for survey data, which often contain design weights (or sample weights) to account for sample selection. However, there is no coverage of such topics in the current book. It would be useful to include a new chapter devoted to multiple imputation with survey data. There are many important topics that are worth discussion, such as how to incorporate design weights in the imputation model, weighted or unweighted estimation, and valid inference procedures. In summary, the author has succeeded in achieving his objective of providing a practical introduction to multiple imputation for incomplete data. The book is particularly appropriate for substantive researchers and graduate students. For educators, the book would also make a reference for a course on the introduction and practical implementation of multiple imputation. Anyone who often utilizes complete-case analyses for handling missing data will find this book a good alternative to solve their problems.},
  number = {527},
  journal = {Journal of the American Statistical Association},
  publisher = {Informa UK Limited},
  author = {Yang, Shu},
  year = {2019},
  month = jul,
  pages = {1421}
}

@misc{mao_wang_yang_2019,
  title = {Matrix completion for survey data prediction with multivariate missingness},
  author = {Mao, X. and Wang, Z. and Yang, S.},
  year = {2019},
  month = aug,
  eprint = {1907.08360},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/pdf/1907.08360}
}

@misc{kong_yang_wang_2019,
  title = {Multi-cause causal inference with unmeasured confounding and binary outcome},
  eprint = {1907.13323},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/pdf/1907.13323},
  author = {Kong, D. and Yang, S.
and Wang, L.},
  year = {2019},
  month = jul
}

@incollection{yang_kim_2019b,
  title = {Nearest Neighbor Imputation for General Parameter Estimation in Survey Sampling},
  author = {Yang, Shu and Kim, Jae Kwang},
  booktitle = {Advances in Econometrics},
  publisher = {Emerald Publishing Limited},
  pages = {209--234},
  year = {2019},
  month = mar,
  isbn = {9781787567269 9781787567252},
  issn = {0731-9053},
  url = {http://dx.doi.org/10.1108/s0731-905320190000039012},
  doi = {10.1108/s0731-905320190000039012},
  abstractNote = {Nearest neighbor imputation has a long tradition for handling item nonresponse in survey sampling. In this article, we study the asymptotic properties of the nearest neighbor imputation estimator for general population parameters, including population means, proportions and quantiles. For variance estimation, we propose novel replication variance estimation, which is asymptotically valid and straightforward to implement. The main idea is to construct replicates of the estimator directly based on its asymptotically linear terms, instead of individual records of variables. The simulation results show that nearest neighbor imputation and the proposed variance estimation provide valid inferences for general population parameters.}
}

@book{chen_yang_kim_2019,
  title = {Nonparametric mass imputation for data integration},
  number = {\#301796},
  author = {Chen, S. and Yang, S.
and Kim, J. K.},
  year = {2019}
}

@article{yang_pieper_cools_2019,
  title = {Semiparametric estimation of structural failure time models in continuous-time processes},
  author = {Yang, S and Pieper, K and Cools, F},
  journal = {Biometrika},
  publisher = {Oxford University Press (OUP)},
  year = {2019},
  month = oct,
  issn = {0006-3444 1464-3510},
  url = {http://dx.doi.org/10.1093/biomet/asz057},
  doi = {10.1093/biomet/asz057}
}

@article{wang_kim_yang_2018,
  title = {Approximate {Bayesian} inference under informative sampling},
  author = {Wang, Z. and Kim, J. K. and Yang, S.},
  journal = {Biometrika},
  volume = {105},
  number = {1},
  pages = {91--102},
  year = {2018},
  month = mar,
  issn = {1464-3510},
  doi = {10.1093/biomet/asx073},
  abstractNote = {Statistical inference with complex survey data is challenging because the sampling design can be informative, and ignoring it can produce misleading results. Current methods of Bayesian inference under complex sampling assume that the sampling design is noninformative for the specified model. In this paper, we propose a Bayesian approach which uses the sampling distribution of a summary statistic to derive the posterior distribution of the parameters of interest. Asymptotic properties of the method are investigated. It is directly applicable to combining information from two independent surveys and to calibration estimation in survey sampling. A simulation study confirms that it can provide valid estimation under informative sampling. We apply it to a measurement error problem using data from the Korean Longitudinal Study of Aging.}
}

@article{yang_ding_2018,
  title = {Asymptotic inference of causal effects with observational studies trimmed by the estimated propensity scores},
  volume = {105},
  issn = {1464-3510},
  doi = {10.1093/biomet/asy008},
  abstractNote = {Causal inference with observational studies often relies on the assumptions of unconfoundedness and overlap of covariate distributions in different treatment groups.
The overlap assumption is violated when some units have propensity scores close to 0 or 1, so both practical and theoretical researchers suggest dropping units with extreme estimated propensity scores. However, existing trimming methods often do not incorporate the uncertainty in this design stage and restrict inference to only the trimmed sample, due to the nonsmoothness of the trimming. We propose a smooth weighting, which approximates sample trimming and has better asymptotic properties. An advantage of our estimator is its asymptotic linearity, which ensures that the bootstrap can be used to make inference for the target population, incorporating uncertainty arising from both design and analysis stages. We extend the theory to the average treatment effect on the treated, suggesting trimming samples with estimated propensity scores close to 1.},
  number = {2},
  journal = {Biometrika},
  author = {Yang, S. and Ding, P.},
  year = {2018},
  month = jun,
  pages = {487--493}
}

@article{yang_kim_2018,
  title = {Discussion: Dissecting Multiple Imputation from a Multi-phase Inference Perspective: What Happens When {God's}, Imputer's and Analyst's Models are Uncongenial? by {X. Xie} and {X. L. Meng}},
  author = {Yang, Shu and Kim, Jae Kwang},
  journal = {Statistica Sinica},
  publisher = {Institute of Statistical Science},
  year = {2018},
  issn = {1017-0405},
  url = {http://dx.doi.org/10.5705/ss.202016.0155},
  doi = {10.5705/ss.202016.0155}
}

@article{lok_yang_sharkey_hughes_2018,
  title = {Estimation of the cumulative incidence function under multiple dependent and independent censoring mechanisms},
  volume = {24},
  issn = {1572-9249},
  doi = {10.1007/s10985-017-9393-4},
  abstractNote = {Competing risks occur in a time-to-event analysis in which a patient can experience one of several types of events. Traditional methods for handling competing risks data presuppose one censoring process, which is assumed to be independent. In a controlled clinical trial, censoring can occur for several reasons: some independent, others dependent.
We propose an estimator of the cumulative incidence function in the presence of both independent and dependent censoring mechanisms. We rely on semi-parametric theory to derive an augmented inverse probability of censoring weighted (AIPCW) estimator. We demonstrate the efficiency gained when using the AIPCW estimator compared to a non-augmented estimator via simulations. We then apply our method to evaluate the safety and efficacy of three anti-HIV regimens in a randomized trial conducted by the AIDS Clinical Trial Group, ACTG A5095.},
  number = {2},
  journal = {Lifetime Data Analysis},
  author = {Lok, Judith J. and Yang, Shu and Sharkey, Brian and Hughes, Michael D.},
  year = {2018},
  month = apr,
  pages = {201--223}
}

@misc{yang_kim_hwang_2018,
  title = {Integration of survey and big observational data for finite population inference using mass imputation},
  author = {Yang, S. and Kim, J. K. and Hwang, Youngdeok},
  year = {2018},
  month = jul,
  eprint = {1807.02817},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1807.02817}
}

@article{yang_tsiatis_blazing_2018,
  title = {Modeling survival distribution as a function of time to treatment discontinuation: A dynamic treatment regime approach},
  volume = {74},
  number = {3},
  issn = {1541-0420},
  doi = {10.1111/biom.12845},
  journal = {Biometrics},
  author = {Yang, Shu and Tsiatis, Anastasios A.
and Blazing, Michael},
  year = {2018},
  month = sep,
  pages = {900--909}
}

@article{yang_2018,
  title = {Propensity Score Weighting for Causal Inference with Clustered Data},
  author = {Yang, Shu},
  journal = {Journal of Causal Inference},
  publisher = {Walter de Gruyter GmbH},
  volume = {6},
  number = {2},
  year = {2018},
  month = sep,
  issn = {2193-3685},
  url = {http://dx.doi.org/10.1515/jci-2017-0027},
  doi = {10.1515/jci-2017-0027}
}

@article{yang_lok_2018,
  title = {Sensitivity Analysis for Unmeasured Confounding in Coarse Structural Nested Mean Models},
  volume = {28},
  issn = {1996-8507},
  doi = {10.5705/ss.202016.0133},
  abstractNote = {Coarse Structural Nested Mean Models (SNMMs, Robins (2000)) and G-estimation can be used to estimate the causal effect of a time-varying treatment from longitudinal observational studies. However, they rely on an untestable assumption of no unmeasured confounding. In the presence of unmeasured confounders, the unobserved potential outcomes are not missing at random, and standard G-estimation leads to biased effect estimates. To remedy this, we investigate the sensitivity of G-estimators of coarse SNMMs to unmeasured confounding, assuming a nonidentifiable bias function which quantifies the impact of unmeasured confounding on the average potential outcome. We present adjusted G-estimators of coarse SNMM parameters and prove their consistency, under the bias modeling for unmeasured confounding.
We apply this to a sensitivity analysis for the effect of the ART initiation time on the mean CD4 count at year 2 after infection in HIV-positive patients, based on the prospective Acute and Early Disease Research Program.},
  number = {4},
  journal = {Statistica Sinica},
  author = {Yang, Shu and Lok, Judith J.},
  year = {2018},
  month = oct,
  pages = {1703--1723}
}

@misc{yang_2018_arxiv,
  title = {Semiparametric efficient estimation of structural nested mean models with irregularly spaced observations},
  author = {Yang, S.},
  year = {2018},
  month = jan,
  eprint = {1810.00042},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1810.00042}
}

@article{kim_yang_2017,
  title = {A note on multiple imputation under complex sampling},
  author = {Kim, J. K. and Yang, S.},
  journal = {Biometrika},
  volume = {104},
  number = {1},
  pages = {221--228},
  year = {2017}
}

@article{yang_kim_2017,
  title = {A semiparametric inference to regression analysis with missing covariates in survey data},
  volume = {27},
  issn = {1017-0405},
  url = {http://dx.doi.org/10.5705/ss.2014.174},
  doi = {10.5705/ss.2014.174},
  abstractNote = {Parameter estimation in parametric regression models with missing covariates is considered under a survey sampling setup. Under missingness at random, a semiparametric maximum likelihood approach is proposed which requires no parametric specification of the marginal covariate distribution. By drawing from the von Mises calculus and V-Statistics theory, we obtain an asymptotic linear representation of the semiparametric maximum likelihood estimator (SMLE) of the regression parameters, which allows for a consistent estimator of asymptotic variance. An EM algorithm for computation is then developed to implement the proposed method using fractional imputation. Simulation results suggest that the SMLE method is robust, whereas the fully parametric method is subject to severe bias under model misspecification.
A rangeland study from the National Resources Inventory (NRI) is used to illustrate the practical use of the proposed methodology.}, number={1}, journal={Statistica Sinica}, publisher={Institute of Statistical Science}, author={Yang, Shu and Kim, Jae Kwang}, year={2017}, pages={261–285} } @article{yang_kim_2017, title={God, devil and guru in the land of multiple imputation discussion}, volume={27}, number={4}, journal={Statistica Sinica}, author={Yang, S. and Kim, J. K.}, year={2017}, pages={1568–1573} } @article{yang_lok_2016, title={A goodness-of-fit test for structural nested mean models}, volume={103}, ISSN={0006-3444 1464-3510}, url={http://dx.doi.org/10.1093/biomet/asw031}, DOI={10.1093/biomet/asw031}, abstractNote={Abstract Coarse structural nested mean models are tools for estimating treatment effects from longitudinal observational data with time-dependent confounding. There is, however, no guidance on how to specify the treatment effect model, and model misspecification can lead to bias. We derive a goodness-of-fit test based on modified over-identification restrictions tests for evaluating a treatment effect model, and show that our test is doubly robust in the sense that, with a correct treatment effect model, the test has the correct Type I error if either the treatment initiation model or a nuisance regression outcome model is correctly specified. In a simulation study, we show that the test has correct Type I error and can detect model misspecification. We use the test to study how the timing of antiretroviral treatment initiation after HIV infection predicts the effect of one year of treatment in HIV-positive patients with acute and early infection.}, number={3}, journal={Biometrika}, publisher={Oxford University Press (OUP)}, author={Yang, S. and Lok, J. 
J.}, year={2016}, month={Jul}, pages={734–741} } @article{yang_kim_2016, title={A note on multiple imputation for method of moments estimation}, volume={103}, ISSN={0006-3444 1464-3510}, url={http://dx.doi.org/10.1093/biomet/asv073}, DOI={10.1093/biomet/asv073}, abstractNote={Multiple imputation is widely used for estimation in situations where there are missing data. Rubin (1987) provided an easily applicable formula for multiple imputation variance estimation, but its validity requires the congeniality condition of Meng (1994), which may not be satisfied for method of moments estimation. We give the asymptotic bias of Rubin's variance estimator when method of moments estimation is used in the complete-sample analysis for each imputed dataset. A new variance estimator based on over-imputation is proposed to provide asymptotically valid inference in this case.}, number={1}, journal={Biometrika}, publisher={Oxford University Press (OUP)}, author={Yang, S. and Kim, J. K.}, year={2016}, month={Feb}, pages={244–251} } @article{yang_kim_2016, title={Fractional Imputation in Survey Sampling: A Comparative Review}, volume={31}, ISSN={0883-4237}, url={http://dx.doi.org/10.1214/16-sts569}, DOI={10.1214/16-sts569}, abstractNote={Fractional imputation (FI) is a relatively new method of imputation for handling item nonresponse in survey sampling. In FI, several imputed values with their fractional weights are created for each record with missing items. Each fractional weight represents the conditional probability of the imputed value given the observed data, and the parameters in the conditional probabilities are often computed by an iterative method such as the EM algorithm. The underlying model for FI can be fully parametric, semiparametric or nonparametric, depending on the plausibility of assumptions and the data structure. 
In this paper, we give an overview of FI, introduce key ideas and methods to readers who are new to the FI literature, and highlight some new developments. We also provide guidance on practical implementation of FI and valid inferential tools after imputation. We demonstrate the empirical performance of FI with respect to multiple imputation using a pseudo finite population generated from a sample from the Monthly Retail Trade Survey conducted by the US Census Bureau.}, number={3}, journal={Statistical Science}, publisher={Institute of Mathematical Statistics}, author={Yang, Shu and Kim, Jae Kwang}, year={2016}, month={Aug}, pages={415–432} } @inproceedings{oh_thuente_2016, title={Jamming and advanced modular-based blind rendezvous algorithms for cognitive radio networks}, DOI={10.1109/wowmom.2016.7523514}, abstractNote={Modular-based channel hopping (CH) rendezvous algorithms can provide guaranteed rendezvous for Cognitive Radio Networks (CRNs) without time synchronization or Common Control Channels (i.e., blind rendezvous). Recently, the Enhanced Jump-Stay (EJS) scheme [13] has been proposed that decreases the Maximum Time To Rendezvous (MTTR) and the Expected Time To Rendezvous (ETTR) for users with a different number of channels (asymmetric). We develop a Symmetric Channel Detecting Jamming (SCDJ) attack and a novel probabilistic Asymmetric Channel Detecting Jamming (ACDJ) attack that dramatically decrease the rendezvous success rates of EJS. Our simulation results show that ACDJ significantly reduces the rendezvous probability of the asymmetric EJS scheme. We analyze the Random rendezvous scheme for the asymmetric model and show it vastly outperforms asymmetric EJS under ACDJ. We also developed the Random Enhanced Jump Stay (REJS) rendezvous that guarantees MTTR, significantly decreases the Expected Time To Rendezvous (ETTR) of EJS, and, due to the random part, appears to be resistant to channel detecting jamming attacks. 
REJS appears to be uniformly better than EJS with major performance improvements and significantly smaller theoretical ETTR upper-bounds and similar positive simulation results.}, booktitle={2016 IEEE 17th International Symposium on a World of Wireless, Mobile and Multimedia Networks (WOWMOM)}, author={Oh, Y. H. and Thuente, D. J.}, year={2016} } @article{yang_imbens_cui_faries_kadziola_2016, title={Propensity Score Matching and Subclassification in Observational Studies with Multi-Level Treatments}, volume={72}, ISSN={["1541-0420"]}, DOI={10.1111/biom.12505}, abstractNote={Summary}, number={4}, journal={BIOMETRICS}, author={Yang, Shu and Imbens, Guido W. and Cui, Zhanglin and Faries, Douglas E. and Kadziola, Zbigniew}, year={2016}, month={Dec}, pages={1055–1065} } @article{peyer_welk_bailey-davis_yang_kim_2015, title={Factors Associated with Parent Concern for Child Weight and Parenting Behaviors}, volume={11}, ISSN={2153-2168 2153-2176}, url={http://dx.doi.org/10.1089/chi.2014.0111}, DOI={10.1089/chi.2014.0111}, abstractNote={Abstract Background: A parent's perception about their child's overweight status is an important precursor or determinant of preventative actions. Acknowledgment of, and concern for, overweight may be moderated by the parent's own weight status whereas engaging in healthy behaviors at home may promote healthy weight status. It is hypothesized that normal weight parents are more likely to engage in healthy behaviors and acknowledge overweight in their own children whereas heavier parents may report more concern about child weight. Methods: A total of 1745 parents of first- through fifth-grade students completed a questionnaire assessing reactions to a school BMI report and perceptions about BMI issues. Specific items included perceptions of child's weight status, concern for child weight status, and preventive practices. Parents also provided information about their own weight status. 
Relationships between measured child weight, perceived child weight, parent weight, parent concern, and healthy behaviors were examined. Results: Overweight parents were more likely to identify overweight in their child and report concern about their child's weight. Concern was higher for parents of overweight children than of normal weight children. Normal weight parents and parents of normal weight children reported more healthy behaviors. Conclusions: Results support the hypothesis that normal weight parents are more likely to engage in healthy behaviors and that overweight parents are more likely to report concern about child weight. However, overweight parents are also more likely to acknowledge overweight status in their own child. Future research should examine links between parent concern and actual pursuit of weight management assistance.}, number={3}, journal={Childhood Obesity}, publisher={Mary Ann Liebert Inc}, author={Peyer, Karissa L. and Welk, Gregory and Bailey-Davis, Lisa and Yang, Shu and Kim, Jae-Kwang}, year={2015}, month={Jun}, pages={269–274} } @article{yang_kim_2015, title={Likelihood-based Inference with Missing Data Under Missing-at-Random}, volume={43}, ISSN={0303-6898}, url={http://dx.doi.org/10.1111/sjos.12184}, DOI={10.1111/sjos.12184}, abstractNote={Abstract}, number={2}, journal={Scandinavian Journal of Statistics}, publisher={Wiley}, author={Yang, Shu and Kim, Jae Kwang}, year={2015}, month={Oct}, pages={436–454} } @book{yang_zhu_2015, title={Semiparametric estimation of spectral density function for irregular spatial data}, url={https://arxiv.org/abs/1508.06886}, author={Yang, S. and Zhu, Z.}, year={2015} } @article{kim_yang_2014, title={Fractional hot deck imputation for robust inference under item nonresponse in survey sampling}, volume={40}, url={https://lib.dr.iastate.edu/stat_las_pubs/116/}, number={2}, journal={Survey Methodology}, author={Kim, J.K. 
and Yang, S.}, year={2014}, month={Dec}, pages={211–230} } @article{yang_zhu_2014, title={Variance estimation and kriging prediction for a class of non-stationary spatial models}, ISSN={1017-0405}, url={http://dx.doi.org/10.5705/ss.2013.205w}, DOI={10.5705/ss.2013.205w}, abstractNote={This paper discusses the estimation and plug-in kriging prediction non-stationary spatial process assuming a smoothly varying variance an additive independent measurement error. A difference-based kernel estimator of the variance function and a modified likelihood estimator of the mea surement error variance are used for parameter estimation. Asymptotic properties of these estimators and the plug-in kriging predictor are established. A simula tion study is presented to test our estimation-prediction procedure. Our kriging predictor is shown to perform better than the spatial adaptive local polynomial regression estimator proposed by Fan and Gijbels (1995) when the measurement error is small.}, journal={Statistica Sinica}, publisher={Institute of Statistical Science}, author={Yang, Shu and Zhu, Zhengyuan}, year={2014} } @inproceedings{kim_zhu_yang_2013, title={Improved estimation for June Area Survey incorporating several information}, booktitle={Proceedings 59th ISI World Statistics Congress}, author={Kim, J.K. and Zhu, Z. and Yang, S.}, year={2013}, pages={199–204} } @article{yang_kim_shin_2013, title={Imputation methods for quantile estimation under missing at random}, volume={6}, ISSN={1938-7989 1938-7997}, url={http://dx.doi.org/10.4310/sii.2013.v6.n3.a7}, DOI={10.4310/sii.2013.v6.n3.a7}, abstractNote={Imputation is frequently used to handle missing data for which multiple imputation is a popular technique. We propose a fractional hot deck imputation which produces a valid variance estimator for quantiles. 
In the proposed method, the imputed values are chosen from the set of respondents and are assigned with proper fractional weights that use a density function for the working model. In addition, we consider a nonparametric fractional imputation method based on nonparametric kernel regression, avoiding a parametric distribution assumption and thus giving more robustness. The resulting estimator can be called nonparametric fractionally imputation estimator. Valid variance estimation is also discussed. A limited simulation study compares the proposed methods favorably with other existing methods.}, number={3}, journal={Statistics and Its Interface}, publisher={International Press of Boston}, author={Yang, Shu and Kim, Jae-Kwang and Shin, Dong Wan}, year={2013}, pages={369–377} } @article{yang_kim_zhu_2013, title={Parametric fractional imputation for mixed models with nonignorable missing data}, volume={6}, ISSN={1938-7989 1938-7997}, url={http://dx.doi.org/10.4310/sii.2013.v6.n3.a4}, DOI={10.4310/sii.2013.v6.n3.a4}, abstractNote={Inference in the presence of non-ignorable missing data is a widely encountered and difficult problem in statistics. Imputation is often used to facilitate parameter estimation, which allows one to use the complete sample estimators on the imputed data set. We develop a parametric fractional imputation (PFI) method proposed by Kim (2011), which simplifies the computation associated with the EM algorithm for maximum likelihood estimation with missing data. We first consider the problem of parameter estimation for linear mixed models with non-ignorable missing values, which assumes that missingness depends on the missing values only through the random effects, leading to shared parameter models (Follmann and Wu, 1995). In the M-step, the restricted or adjusted profiled maximum likelihood method is used to reduce the bias of maximum likelihood estimation of the variance components. 
Results from a limited simulation study are presented to compare the proposed method with the existing methods, which demonstrates that imputation can significantly reduce the nonresponse bias and the idea of adjusted profiled maximum likelihood works nicely in PFI for the bias correction in estimating the variance components. Variance estimation is also discussed. We next extend PFI to generalized linear mixed model and the flexibility of this method is illustrated by analyzing the infamous salamander mating data (McCullagh and Nelder, 1989).}, number={3}, journal={Statistics and Its Interface}, publisher={International Press of Boston}, author={Yang, Shu and Kim, Jae-Kwang and Zhu, Zhengyuan}, year={2013}, pages={339–347} } @book{larsen_yang_rappold_reich, title={A spatial causal analysis of wildland fire- contributed PM2.5 using numerical model output}, url={https://arxiv.org/pdf/2003.06037}, author={Larsen, A. and Yang, S. and Rappold, A. and Reich, B.} } @book{guan_yang, title={A unified framework for causal inference with multiple imputation using martingale}, url={https://arxiv.org/pdf/1911.04663}, author={Guan, Q. and Yang, S.} } @book{tang_yang_wang_cui_li_faries, title={Causal inference of hazard ratio based on propensity score matching}, url={https://arxiv.org/pdf/1911.12430}, author={Tang, S. and Yang, S. and Wang, T. and Cui, Z. and Li, L. and Faries, D.} }