% Export clean-up for this entry: ISSN stored as a plain value (not a JSON
% array), journal in title case, doubled author initials removed, month given
% as a macro, proper nouns in the title brace-protected.
@article{cosco_wells_zhang_goodell_monsur_xu_moore_2022,
  title = {Hands-on childcare garden intervention: A randomized controlled trial to assess effects on fruit and vegetable identification, liking, and consumption among children aged 3-5 years in {North Carolina}},
  volume = {13},
  issn = {1664-1078},
  doi = {10.3389/fpsyg.2022.993637},
  abstractNote = {Gardening at childcare centers may have a potent influence on young children’s learning about fruits and vegetables and their development of healthy dietary behaviors. This randomized controlled trial examined the effect of a garden intervention on fruit and vegetable (FV) identification, FV liking, and FV consumption among 3–5-year-old children enrolled in childcare centers in Wake County, North Carolina, USA. Eligible childcare centers (serving primarily low-income families) were randomly selected and then randomly assigned to one of three groups: (1) intervention; (2) waitlist-control that served as a control in year 1 and received the intervention in year 2; or (3) no-intervention control. From the 15 participating childcare centers, 285 children aged 3–5 years were consented by their parents or guardians to participate. The intervention comprised six standardized, raised, mulched garden beds, planted with warm-season annual vegetables and fruits, and perennial fruits. A Gardening Activity Guide describing 12 age-appropriate, sequential gardening activities was distributed for teachers to lead hands-on gardening activities during the growing season. Data were gathered between Spring 2018 and Fall 2019. FV identification and liking were measured using an age-appropriate tablet-enabled protocol. FV consumption was measured by weighing each child’s fruit and vegetable snack tray before and after tasting sessions.
Compared to children receiving no-intervention, children who received the garden intervention showed a greater increase in accurate identification of both fruits and vegetables as well as consumption of both fruit and vegetables during the tasting sessions. Consistent with prior research, the effects on fruit consumption were greater than on vegetable consumption. There was no significant effect of the garden intervention on children’s FV liking. Garden interventions implemented early in life foster learning about FV and promote healthy eating. Early exposure to gardening may yield a return on investment throughout the lifecourse, impacting healthy diet and associated health outcomes, which are particularly important within disadvantaged communities where children’s health is challenged by a host of risk factors. Clinical Trials Registration #NCT04864574 ( clinicaltrials.gov ).},
  journal = {Frontiers in Psychology},
  author = {Cosco, Nilda G. and Wells, Nancy M. and Zhang, Daowen and Goodell, L. Suzanne and Monsur, Muntazar and Xu, Tong and Moore, Robin C.},
  year = {2022},
  month = nov,
}

% The study acronym in the title is brace-protected so sentence-casing styles
% do not lowercase it.
@article{cosco_wells_monsur_goodell_zhang_xu_hales_moore_2021,
  title = {Research Design, Protocol, and Participant Characteristics of {COLEAFS}: A Cluster Randomized Controlled Trial of a Childcare Garden Intervention},
  volume = {18},
  issn = {1660-4601},
  doi = {10.3390/ijerph182413066},
  abstractNote = {Childcare garden interventions may be an effective strategy to increase fruit and vegetable (FV) consumption and physical activity among young children. The objective of this paper is to describe the research design, protocol, outcome measures, and baseline characteristics of participants in the Childcare Outdoor Learning Environments as Active Food Systems ("COLEAFS") study, a cluster randomized controlled trial (RCT) examining the effect of a garden intervention on outcomes related to diet and physical activity.
Fifteen childcare centers in low-income areas were randomly assigned to intervention (to receive garden intervention in Year 1), waitlist control (to receive garden intervention in Year 2), and control group (no intervention). The garden intervention comprised six raised beds planted with warm-season vegetables and fruits, and a garden activity booklet presenting 12 gardening activities. FV knowledge and FV liking were measured using a tablet-enabled protocol. FV consumption was measured by weighing FV before and after a snack session. Physical activity was measured using Actigraph GT3x+ worn by children for three consecutive days while at the childcare center. Of the 543 eligible children from the 15 childcare centers, 250 children aged 3-5 years received parental consent, assented, and participated in baseline data collection. By employing an RCT to examine the effect of a garden intervention on diet and physical activity among young children attending childcare centers within low-income communities, this study offers compelling research design and methods, addresses a critical gap in the empirical literature, and is a step toward evidence-based regulations to promote early childhood healthy habits.},
  number = {24},
  journal = {International Journal of Environmental Research and Public Health},
  author = {Cosco, Nilda Graciela and Wells, Nancy M. and Monsur, Muntazar and Goodell, Lora Suzanne and Zhang, Daowen and Xu, Tong and Hales, Derek and Moore, Robin Clive},
  year = {2021},
  month = dec,
}

% Review paper published in a journal: typed as an article (the export used the
% misc type) so that journal, volume, number and pages are actually rendered.
@article{liu_shih_strawderman_zhang_johnson_chai_2019,
  title = {Statistical Analysis of Zero-Inflated Nonnegative Continuous Data: A Review},
  volume = {34},
  issn = {2168-8745},
  doi = {10.1214/18-STS681},
  abstractNote = {Zero-inflated nonnegative continuous (or semicontinuous) data arise frequently in biomedical, economical, and ecological studies.
Examples include substance abuse, medical costs, medical care utilization, biomarkers (e.g., CD4 cell counts, coronary artery calcium scores), single cell gene expression rates, and (relative) abundance of microbiome. Such data are often characterized by the presence of a large portion of zero values and positive continuous values that are skewed to the right and heteroscedastic. Both of these features suggest that no simple parametric distribution may be suitable for modeling such type of outcomes. In this paper, we review statistical methods for analyzing zero-inflated nonnegative outcome data. We will start with the cross-sectional setting, discussing ways to separate zero and positive values and introducing flexible models to characterize right skewness and heteroscedasticity in the positive values. We will then present models of correlated zero-inflated nonnegative continuous data, using random effects to tackle the correlation on repeated measures from the same subject and that across different parts of the model. We will also discuss expansion to related topics, for example, zero-inflated count and survival data, nonlinear covariate effects, and joint models of longitudinal zero-inflated nonnegative continuous data and survival. Finally, we will present applications to three real datasets (i.e., microbiome, medical costs, and alcohol drinking) to illustrate these methods. Example code will be provided to facilitate applications of these methods.},
  number = {2},
  journal = {Statistical Science},
  author = {Liu, Lei and Shih, Ya-Chen Tina and Strawderman, Robert L. and Zhang, Daowen and Johnson, Bankole A. and Chai, Haitao},
  year = {2019},
  month = may,
  pages = {253--279},
}

% Page range uses the BibTeX double hyphen; month is the predefined macro.
@article{chen_liu_shih_zhang_severini_2016,
  title = {A flexible model for correlated medical costs, with application to medical expenditure panel survey data},
  volume = {35},
  issn = {1097-0258},
  doi = {10.1002/sim.6743},
  abstractNote = {We propose a flexible model for correlated medical cost data with several appealing features. First, the mean function is partially linear. Second, the distributional form for the response is not specified. Third, the covariance structure of correlated medical costs has a semiparametric form. We use extended generalized estimating equations to simultaneously estimate all parameters of interest. B-splines are used to estimate unknown functions, and a modification to Akaike information criterion is proposed for selecting knots in spline bases. We apply the model to correlated medical costs in the Medical Expenditure Panel Survey dataset. Simulation studies are conducted to assess the performance of our method. Copyright © 2015 John Wiley & Sons, Ltd.},
  number = {6},
  journal = {Statistics in Medicine},
  author = {Chen, Jinsong and Liu, Lei and Shih, Ya-Chen T. and Zhang, Daowen and Severini, Thomas A.},
  year = {2016},
  month = mar,
  pages = {883--894},
}

@article{zhang_sun_pieper_2016,
  title = {Bivariate Mixed Effects Analysis of Clustered Data with Large Cluster Sizes},
  volume = {8},
  issn = {1867-1772},
  doi = {10.1007/s12561-015-9140-x},
  abstractNote = {Linear mixed effects models are widely used to analyze a clustered response variable. Motivated by a recent study to examine and compare the hospital length of stay (LOS) between patients undertaking percutaneous coronary intervention (PCI) and coronary artery bypass graft (CABG) from several international clinical trials, we proposed a bivariate linear mixed effects model for the joint modeling of clustered PCI and CABG LOS's where each clinical trial is considered a cluster.
Due to the large number of patients in some trials, commonly used commercial statistical software for fitting (bivariate) linear mixed models failed to run since it could not allocate enough memory to invert large dimensional matrices during the optimization process. We consider ways to circumvent the computational problem in the maximum likelihood (ML) inference and restricted maximum likelihood (REML) inference. Particularly, we developed an expected and maximization (EM) algorithm for the REML inference and presented an ML implementation using existing software. The new REML EM algorithm is easy to implement and computationally stable and efficient. With this REML EM algorithm, we could analyze the LOS data and obtained meaningful results.},
  number = {2},
  journal = {Statistics in Biosciences},
  author = {Zhang, Daowen and Sun, Jie Lena and Pieper, Karen},
  year = {2016},
  month = oct,
  pages = {220--233},
}

% The acronym in the title is brace-protected against sentence-casing; the
% ampersand in the journal name is escaped so the field typesets under LaTeX.
@article{bernhardt_zhang_wang_2015,
  title = {A fast {EM} algorithm for fitting joint models of a binary response and multiple longitudinal covariates subject to detection limits},
  volume = {85},
  issn = {1872-7352},
  doi = {10.1016/j.csda.2014.11.011},
  abstractNote = {Joint modeling techniques have become a popular strategy for studying the association between a response and one or more longitudinal covariates. Motivated by the GenIMS study, where it is of interest to model the event of survival using censored longitudinal biomarkers, a joint model is proposed for describing the relationship between a binary outcome and multiple longitudinal covariates subject to detection limits. A fast, approximate EM algorithm is developed that reduces the dimension of integration in the E-step of the algorithm to one, regardless of the number of random effects in the joint model. Numerical studies demonstrate that the proposed approximate EM algorithm leads to satisfactory parameter and variance estimates in situations with and without censoring on the longitudinal covariates.
The approximate EM algorithm is applied to analyze the GenIMS data set.},
  journal = {Computational Statistics \& Data Analysis},
  author = {Bernhardt, Paul W. and Zhang, Daowen and Wang, Huixia Judy},
  year = {2015},
  month = may,
  pages = {37--53},
}

% NOTE(review): the export carried a truncated page range ("695-"). The end
% page 710 is taken from the journal's record for this article and is not
% derivable from this file -- confirm before relying on it.
@article{zhao_marceau_zhang_tzeng_2015,
  title = {Assessing gene-environment interactions for common and rare variants with binary traits using gene-trait similarity regression},
  volume = {199},
  number = {3},
  journal = {Genetics},
  author = {Zhao, G. L. and Marceau, R. and Zhang, D. W. and Tzeng, J. Y.},
  year = {2015},
  pages = {695--710},
}

@article{bernhardt_wang_zhang_2015,
  title = {Statistical Methods for Generalized Linear Models with Covariates Subject to Detection Limits},
  volume = {7},
  issn = {1867-1764 1867-1772},
  url = {http://dx.doi.org/10.1007/S12561-013-9099-4},
  doi = {10.1007/S12561-013-9099-4},
  abstractNote = {Censored observations are a common occurrence in biomedical data sets. Although a large amount of research has been devoted to estimation and inference for data with censored responses, very little research has focused on proper statistical procedures when predictors are censored. In this paper, we consider statistical methods for dealing with multiple predictors subject to detection limits within the context of generalized linear models. We investigate and adapt several conventional methods and develop a new multiple imputation approach for analyzing data sets with predictors censored due to detection limits. We establish the consistency and asymptotic normality of the proposed multiple imputation estimator and suggest a computationally simple and consistent variance estimator. We also demonstrate that the conditional mean imputation method often leads to inconsistent estimates in generalized linear models, while several other methods are either computationally intensive or lead to parameter estimates that are biased or more variable compared to the proposed multiple imputation estimator.
In an extensive simulation study, we assess the bias and variability of different approaches within the context of a logistic regression model and compare variance estimation methods for the proposed multiple imputation estimator. Lastly, we apply several methods to analyze the data set from a recently-conducted GenIMS study.},
  number = {1},
  journal = {Statistics in Biosciences},
  publisher = {Springer Science and Business Media LLC},
  author = {Bernhardt, Paul W. and Wang, Huixia J. and Zhang, Daowen},
  year = {2015},
  month = may,
  pages = {68--89},
}

% Journal name in title case with the ampersand escaped for LaTeX; ISSN as a
% plain value; page range with the BibTeX double hyphen.
@article{bernhardt_wang_zhang_2014,
  title = {Flexible modeling of survival data with covariates subject to detection limits via multiple imputation},
  volume = {69},
  issn = {1872-7352},
  doi = {10.1016/j.csda.2013.07.027},
  abstractNote = {Models for survival data generally assume that covariates are fully observed. However, in medical studies it is not uncommon for biomarkers to be censored at known detection limits. A computationally-efficient multiple imputation procedure for modeling survival data with covariates subject to detection limits is proposed. This procedure is developed in the context of an accelerated failure time model with a flexible seminonparametric error distribution. The consistency and asymptotic normality of the multiple imputation estimator are established and a consistent variance estimator is provided. An iterative version of the proposed multiple imputation algorithm that approximates the EM algorithm for maximum likelihood is also suggested. Simulation studies demonstrate that the proposed multiple imputation methods work well while alternative methods lead to estimates that are either biased or more variable. The proposed methods are applied to analyze the dataset from a recently-conducted GenIMS study.},
  journal = {Computational Statistics \& Data Analysis},
  author = {Bernhardt, Paul W. and Wang, Huixia Judy and Zhang, Daowen},
  year = {2014},
  month = jan,
  pages = {81--91},
}

@article{wang_zhang_tzeng_2014,
  title = {Pathway-Guided Identification of Gene-Gene Interactions},
  volume = {78},
  issn = {1469-1809},
  doi = {10.1111/ahg.12080},
  abstractNote = {Assessing gene-gene interactions (GxG) at the gene level can permit examination of epistasis at biologically functional units with amplified interaction signals from marker-marker pairs. While current gene-based GxG methods tend to be designed for two or a few genes, for complex traits, it is often common to have a list of many candidate genes to explore GxG. We propose a regression model with pathway-guided regularization for detecting interactions among genes. Specifically, we use the principal components to summarize the SNP-SNP interactions between a gene pair, and use an L1 penalty that incorporates adaptive weights based on biological guidance and trait supervision to identify important main and interaction effects. Our approach aims to combine biological guidance and data adaptiveness, and yields credible findings that may be likely to shed insights in order to formulate biological hypotheses for further molecular studies. The proposed approach can be used to explore the GxG with a list of many candidate genes and is applicable even when sample size is smaller than the number of predictors studied. We evaluate the utility of the proposed method using simulation and real data analysis.
The results suggest improved performance over methods not utilizing pathway and trait guidance.},
  number = {6},
  journal = {Annals of Human Genetics},
  author = {Wang, Xin and Zhang, Daowen and Tzeng, Jung-Ying},
  year = {2014},
  month = nov,
  pages = {478--491},
}

% ISSN as a plain value, journal in title case, month macro, double-hyphen
% page range.
@article{chen_liu_zhang_shih_2013,
  title = {A flexible model for the mean and variance functions, with application to medical cost data},
  volume = {32},
  issn = {1097-0258},
  doi = {10.1002/sim.5838},
  abstractNote = {Medical cost data are often skewed to the right and heteroscedastic, having a nonlinear relation with covariates. To tackle these issues, we consider an extension to generalized linear models by assuming nonlinear associations of covariates in the mean function and allowing the variance to be an unknown but smooth function of the mean. We make no further assumption on the distributional form. The unknown functions are described by penalized splines, and the estimation is carried out using nonparametric quasi-likelihood. Simulation studies show the flexibility and advantages of our approach. We apply the model to the annual medical costs of heart failure patients in the clinical data repository at the University of Virginia Hospital System. Copyright © 2013 John Wiley & Sons, Ltd.},
  number = {24},
  journal = {Statistics in Medicine},
  author = {Chen, Jinsong and Liu, Lei and Zhang, Daowen and Shih, Ya-Chen T.},
  year = {2013},
  month = oct,
  pages = {4306--4318},
}

% Chapter with its own title inside a larger volume: typed as incollection so
% that classic styles print the chapter title together with the booktitle
% (the inbook type would treat the title field as the title of the book).
@incollection{torres_zhang_wang_2013,
  title = {Constructing Conditional Reference Charts for Grip Strength Measured with Error},
  isbn = {9781461478454 9781461478461},
  issn = {2194-1009 2194-1017},
  url = {http://dx.doi.org/10.1007/978-1-4614-7846-1_24},
  doi = {10.1007/978-1-4614-7846-1_24},
  abstractNote = {Muscular strength, usually quantified through the grip strength, can be used in humans and animals as an indicator of neuromuscular function or to assess hand function in patients with trauma or congenital problems.
Because grip strength cannot be accurately measured, several contaminated measurements are often taken on the same subject. A research interest in grip strength studies is estimating the conditional quantiles of the latent grip strength, which can be used to construct conditional grip strength charts. Current work in the literature often applies conventional quantile regression method using the subject-specific average of the repeated measurements as the response variable. We show that this approach suffers from model misspecification and often leads to biased estimates of the conditional quantiles of the latent grip strength. We propose a new semi-nonparametric estimation approach, which is able to account for measurement errors and allows the subject-specific random effects to follow a flexible distribution. We demonstrate through simulation studies that the proposed method leads to consistent and efficient estimates of the conditional quantiles of the latent response variable. The value of the proposed method is assessed by analyzing a grip strength data set on laboratory mice.},
  booktitle = {Springer Proceedings in Mathematics \& Statistics},
  publisher = {Springer New York},
  author = {Torres, Pedro A. and Zhang, Daowen and Wang, Huixia Judy},
  year = {2013},
  pages = {299--310},
}

% Month as a macro and page range with the BibTeX double hyphen; everything
% else is as exported.
@article{yan_zhang_lu_grifo_liu_2012,
  title = {A Semi-nonparametric Approach to Joint Modeling of A Primary Binary Outcome and Longitudinal Data Measured at Discrete Informative Times},
  volume = {4},
  issn = {1867-1764 1867-1772},
  url = {http://dx.doi.org/10.1007/S12561-011-9053-2},
  doi = {10.1007/S12561-011-9053-2},
  abstractNote = {In a study conducted at the New York University Fertility Center, one of the scientific objectives is to investigate the relationship between the final pregnancy outcomes of participants receiving an in vitro fertilization (IVF) treatment and their β-human chorionic gonadotrophin (β-hCG) profiles.
A common joint modeling approach to this objective is to use subject-specific normal random effects in a linear mixed model for longitudinal β-hCG data as predictors in a model (e.g., logistic model) for the final pregnancy outcome. Empirical data exploration indicates that the observation times for longitudinal β-hCG data may be informative and the distribution of random effects for longitudinal β-hCG data may not be normally distributed. We propose to introduce a third model in the joint model for the informative β-hCG observation times, and relax the normality distributional assumption of random effects using the semi-nonparametric (SNP) approach of Gallant and Nychka (Econometrica 55:363–390, 1987). An EM algorithm is developed for parameter estimation. Extensive simulation designed to evaluate the proposed method indicates that ignoring either informative observation times or distributional assumption of the random effects would lead to invalid and/or inefficient inference. Applying our new approach to the data reveals some interesting findings the traditional approach failed to discover.},
  number = {2},
  journal = {Statistics in Biosciences},
  publisher = {Springer Science and Business Media LLC},
  author = {Yan, Song and Zhang, Daowen and Lu, Wenbin and Grifo, James A. and Liu, Mengling},
  year = {2012},
  month = jan,
  pages = {213--234},
}

% NOTE(review): the citation key below contains an apostrophe (o'quigley). It
% is kept unchanged so existing citations keep resolving, but some tools
% reject such keys -- consider adding an ASCII-only alias.
@article{chen_johnson_wang_o'quigley_isaac_zhang_liu_2012,
  title = {Trajectory Analyses in Alcohol Treatment Research},
  volume = {36},
  issn = {0145-6008},
  doi = {10.1111/j.1530-0277.2012.01748.x},
  abstractNote = {Various statistical methods have been used for data analysis in alcohol treatment studies. Trajectory analyses can better capture differences in treatment effects and may provide insight on the optimal duration of future clinical trials and grace periods.
This improves on the limitation of commonly used parametric (e.g., linear) methods that cannot capture nonlinear temporal trends in the data. We propose an exploratory approach, using more flexible smoothing mixed effects models, more accurately to characterize the temporal patterns of the drinking data. We estimated the trajectories of the treatment arms for data sets from 2 sources: a multisite topiramate study, and the Combined Pharmacotherapies (acamprosate and naltrexone) and Behavioral Interventions study. Our methods illustrate that drinking outcomes of both the topiramate and placebo arms declined over the entire course of the trial but with a greater rate of decline for the topiramate arm. By the point-wise confidence intervals, the heavy drinking probabilities for the topiramate arm might differ from those of the placebo arm as early as week 2. Furthermore, the heavy drinking probabilities of both arms seemed to stabilize at the end of the study. Overall, naltrexone was better than placebo in reducing drinking over time yet was not different from placebo for subjects receiving the combination of a brief medical management and an intensive combined behavioral intervention. The estimated trajectory plots clearly showed nonlinear temporal trends of the treatment with different medications on drinking outcomes and offered more detailed interpretation of the results. This trajectory analysis approach is proposed as a valid exploratory method for evaluating efficacy in pharmacotherapy trials in alcoholism.},
  number = {8},
  journal = {Alcoholism: Clinical and Experimental Research},
  author = {Chen, Jinsong and Johnson, Bankole A. and Wang, Xin-Qun and O'Quigley, John and Isaac, Maria and Zhang, Daowen and Liu, Lei},
  year = {2012},
  month = aug,
  pages = {1442--1448},
}

% The exported abstract contained the same text twice in one field; a single
% copy is kept here (with the misspelling of "accommodate" corrected).
@article{tzeng_zhang_pongpanich_smith_mccarthy_sale_worrall_hsu_thomas_sullivan_2011,
  title = {Studying Gene and Gene-Environment Effects of Uncommon and Common Variants on Continuous Traits: A Marker-Set Approach Using Gene-Trait Similarity Regression},
  volume = {89},
  issn = {1537-6605},
  doi = {10.1016/j.ajhg.2011.07.007},
  abstractNote = {Genomic association analyses of complex traits demand statistical tools that are capable of detecting small effects of common and rare variants and modeling complex interaction effects and yet are computationally feasible. In this work, we introduce a similarity-based regression method for assessing the main genetic and interaction effects of a group of markers on quantitative traits. The method uses genetic similarity to aggregate information from multiple polymorphic sites and integrates adaptive weights that depend on allele frequencies to accommodate common and uncommon variants. Collapsing information at the similarity level instead of the genotype level avoids canceling signals that have the opposite etiological effects and is applicable to any class of genetic variants without the need for dichotomizing the allele types. To assess gene-trait associations, we regress trait similarities for pairs of unrelated individuals on their genetic similarities and assess association by using a score test whose limiting distribution is derived in this work. The proposed regression framework allows for covariates, has the capacity to model both main and interaction effects, can be applied to a mixture of different polymorphism types, and is computationally efficient. These features make it an ideal tool for evaluating associations between phenotype and marker sets defined by linkage disequilibrium (LD) blocks, genes, or pathways in whole-genome analysis.},
  number = {2},
  journal = {American Journal of Human Genetics},
  author = {Tzeng, Jung-Ying and Zhang, Daowen and Pongpanich, Monnat and Smith, Chris and McCarthy, Mark I. and Sale, Michele M. and Worrall, Bradford B. and Hsu, Fang-Chi and Thomas, Duncan C. and Sullivan, Patrick F.},
  year = {2011},
  month = aug,
  pages = {277--288},
}

% The export fused several abstract sentences together with no space after the
% full stop; the missing spaces are restored below.
@article{munana_zhang_patterson_2010,
  title = {Placebo Effect in Canine Epilepsy Trials},
  volume = {24},
  issn = {0891-6640},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-73349107805&partnerID=MN8TOARS},
  doi = {10.1111/j.1939-1676.2009.0407.x},
  abstractNote = {The placebo effect is a well-recognized phenomenon in human medicine; in contrast, little information exists on the effect of placebo administration in veterinary patients. Nonpharmacologic therapeutic effects play a role in response rates identified in canine epilepsy trials. Thirty-four dogs with epilepsy. Meta-analysis of the 3 known prospective, placebo-controlled canine epilepsy trials. The number of seizures per week was compiled for each dog throughout their participation in the trial. Log-linear models were developed to evaluate seizure frequency during treatment and placebo relative to baseline. Twenty-two of 28 (79%) dogs in the study that received placebo demonstrated a decrease in seizure frequency compared with baseline, and 8 (29%) could be considered responders, with a 50% or greater reduction in seizures. For the 3 trials evaluated, the average reduction in seizures during placebo administration relative to baseline was 26% (P = .0018), 29% (P = .17), and 46% (P = .01). A positive response to placebo administration, manifesting as a decrease in seizure frequency, can be observed in epileptic dogs. This is of importance when evaluating open label studies in dogs that aim to assess efficacy of antiepileptic drugs, as the reported results might be overstated. Findings from this study highlight the need for more placebo-controlled trials in veterinary medicine.},
  number = {1},
  journal = {Journal of Veterinary Internal Medicine},
  author = {Munana, K. R. and Zhang, D. and Patterson, E.
E.},
  year = {2010},
  pages = {166--170},
}

% ISSN as a plain value, journal in title case, month macro, double-hyphen
% page range.
@article{levine_zhang_harris_vaden_2010,
  title = {The use of pooled vs serial urine samples to measure urine protein:creatinine ratios},
  volume = {39},
  issn = {0275-6382},
  doi = {10.1111/j.1939-165x.2009.00167.x},
  abstractNote = {Background: Evaluation of serial urine protein:creatinine (UPC) ratios is important in prognosticating chronic kidney disease and monitoring response to therapeutic interventions. Owing to random biologic variation in dogs with stable glomerular proteinuria, multiple determinations of UPC ratios often are recommended to reliably assess urine protein loss. This can be cost-prohibitive. Objective: The purpose of this study was to evaluate agreement between the mean of 3 UPC ratios obtained on 3 separate urine samples per dog and a single UPC ratio obtained when aliquots of the separate samples were pooled and analyzed as 1 sample. Methods: Three separate urine samples were collected from each of 25 dogs, both client-owned and members of a research colony. Protein and creatinine concentrations were measured in the supernatant of each sample using a biochemical analyzer, and the mean of the 3 UPC ratios was calculated. A 1.0 mL aliquot of each of the 3 samples from each dog was pooled to create a fourth sample for that dog, and the UPC ratio of the pooled sample was similarly determined. Agreement and correlation between the mean and pooled UPC ratios were assessed using Bland–Altman difference plots and regression analysis, respectively. Results: The UPC ratio in the pooled samples was highly correlated (r=.9998, P<.0001) with the mean UPC ratio of the 3 separate samples. Strong agreement between results was demonstrated; a UPC ratio from a pooled sample was at most ±20% different than the mean UPC ratio obtained from 3 separate samples.
Conclusions: Measuring the UPC ratio in a pooled sample containing equal volumes of several different urine specimens from the same dog provides a reliable and cost-effective alternative to assessing multiple UPC ratios on several specimens from the same dog.},
  number = {1},
  journal = {Veterinary Clinical Pathology},
  author = {LeVine, Dana N. and Zhang, Daowen and Harris, Tonya and Vaden, Shelly L.},
  year = {2010},
  month = mar,
  pages = {53--56},
}

@article{ni_zhang_zhang_2010,
  title = {Variable Selection for Semiparametric Mixed Models in Longitudinal Studies},
  volume = {66},
  issn = {1541-0420},
  doi = {10.1111/j.1541-0420.2009.01240.x},
  abstractNote = {Summary We propose a double-penalized likelihood approach for simultaneous model selection and estimation in semiparametric mixed models for longitudinal data. Two types of penalties are jointly imposed on the ordinary log-likelihood: the roughness penalty on the nonparametric baseline function and a nonconcave shrinkage penalty on linear coefficients to achieve model sparsity. Compared to existing estimation equation based approaches, our procedure provides valid inference for data with missing at random, and will be more efficient if the specified model is correct. Another advantage of the new procedure is its easy computation for both regression components and variance parameters. We show that the double-penalized problem can be conveniently reformulated into a linear mixed model framework, so that existing software can be directly used to implement our method. For the purpose of model inference, we derive both frequentist and Bayesian variance estimation for estimated parametric and nonparametric components. Simulation is used to evaluate and compare the performance of our method to the existing ones.
We then apply the new method to a real data set from a lactation study.}, number={1}, journal={BIOMETRICS}, author={Ni, Xiao and Zhang, Daowen and Zhang, Hao Helen}, year={2010}, month={Mar}, pages={79–88} } @article{ni_zhang_zhang_2009, title={Automatic model selection for partially linear models}, volume={100}, ISSN={["0047-259X"]}, DOI={10.1016/j.jmva.2009.06.009}, abstractNote={We propose and study a unified procedure for variable selection in partially linear models. A new type of double-penalized least squares is formulated, using the smoothing spline to estimate the nonparametric part and applying a shrinkage penalty on parametric components to achieve model parsimony. Theoretically we show that, with proper choices of the smoothing and regularization parameters, the proposed procedure can be as efficient as the oracle estimator [J. Fan, R. Li, Variable selection via nonconcave penalized likelihood and its oracle properties, Journal of American Statistical Association 96 (2001) 1348–1360]. We also study the asymptotic properties of the estimator when the number of parametric effects diverges with the sample size. Frequentist and Bayesian estimates of the covariance and confidence intervals are derived for the estimators. One great advantage of this procedure is its linear mixed model (LMM) representation, which greatly facilitates its implementation by using standard statistical software. Furthermore, the LMM framework enables one to treat the smoothing parameter as a variance component and hence conveniently estimate it together with other regression coefficients. 
Extensive numerical studies are conducted to demonstrate the effective performance of the proposed procedure.}, number={9}, journal={JOURNAL OF MULTIVARIATE ANALYSIS}, author={Ni, Xiao and Zhang, Hao Helen and Zhang, Daowen}, year={2009}, month={Oct}, pages={2100–2111} } @article{tzeng_zhang_chang_thomas_davidian_2009, title={Gene-Trait Similarity Regression for Multimarker-Based Association Analysis}, volume={65}, ISSN={0006-341X}, url={http://dx.doi.org/10.1111/j.1541-0420.2008.01176.x}, DOI={10.1111/j.1541-0420.2008.01176.x}, abstractNote={Summary We propose a similarity-based regression method to detect associations between traits and multimarker genotypes. The model regresses similarity in traits for pairs of “unrelated” individuals on their haplotype similarities, and detects the significance by a score test for which the limiting distribution is derived. The proposed method allows for covariates, uses phase-independent similarity measures to bypass the needs to impute phase information, and is applicable to traits of general types (e.g., quantitative and qualitative traits). We also show that the gene-trait similarity regression is closely connected with random effects haplotype analysis, although commonly they are considered as separate modeling tools. This connection unites the classic haplotype sharing methods with the variance-component approaches, which enables direct derivation of analytical properties of the sharing statistics even when the similarity regression model becomes analytically challenging.}, number={3}, journal={Biometrics}, publisher={Wiley}, author={Tzeng, Jung-Ying and Zhang, Daowen and Chang, Sheng-Mao and Thomas, Duncan C. 
and Davidian, Marie}, year={2009}, month={Feb}, pages={822–832} } @article{zhang_quan_2009, title={Power and sample size calculation for log-rank test with a time lag in treatment effect}, volume={28}, ISSN={["0277-6715"]}, DOI={10.1002/sim.3501}, abstractNote={The log-rank test is the most powerful non-parametric test for detecting a proportional hazards alternative and thus is the most commonly used testing procedure for comparing time-to-event distributions between different treatments in clinical trials. When the log-rank test is used for the primary data analysis, the sample size calculation should also be based on the test to ensure the desired power for the study. In some clinical trials, the treatment effect may not manifest itself right after patients receive the treatment. Therefore, the proportional hazards assumption may not hold. Furthermore, patients may discontinue the study treatment prematurely and thus may have diluted treatment effect after treatment discontinuation. If a patient's treatment termination time is independent of his/her time-to-event of interest, the termination time can be treated as a censoring time in the final data analysis. Alternatively, we may keep collecting time-to-event data until study termination from those patients who discontinued the treatment and conduct an intent-to-treat analysis by including them in the original treatment groups. We derive formulas necessary to calculate the asymptotic power of the log-rank test under this non-proportional hazards alternative for the two data analysis strategies. Simulation studies indicate that the formulas provide accurate power for a variety of trial settings. A clinical trial example is used to illustrate the application of the proposed methods. 
Copyright © 2009 John Wiley & Sons, Ltd.}, number={5}, journal={STATISTICS IN MEDICINE}, author={Zhang, Daowen and Quan, Hui}, year={2009}, month={Feb}, pages={864–879} } @article{sowers_eyvazzadeh_mcconnell_yosef_jannausch_zhang_harlow_randolph_2008, title={Anti-Mullerian hormone and inhibin B in the definition of ovarian aging and the menopause transition}, volume={93}, ISSN={["1945-7197"]}, DOI={10.1210/jc.2008-0567}, abstractNote={Context/Objective: The objective of the study was to determine whether anti-Mullerian hormone (AMH) and inhibin B are viable endocrine biomarkers for framing the menopause transition from initiation to the final menstrual period (FMP). Design: We assayed AMH, inhibin B, and FSH in 300 archival follicular phase specimens from 50 women with six consecutive annual visits commencing in 1993 when all women were in the pre- and perimenopausal menopause stages. Subsequently each woman had a documented FMP. The assay results were fitted as individual-woman profiles and then related to time to FMP and age at FMP as outcomes. Results: Based on annual values from six time points prior to the FMP, logAMH longitudinal profiles declined and were highly associated with a time point 5 yr prior to FMP [including both observed and values below detection (P < 0.0001 and P = 0.0001, respectively)]. Baseline AMH profiles were also associated with age at FMP (P = 0.035). Models of declining loginhibin B profiles (including both observed and values below detection) were associated with time to FMP (P < 0.0001 and P = 0.0003, respectively). There was no significant association of loginhibin B profiles with age at FMP. Conclusions: AMH, an endocrine marker that reflects the transition of resting primordial follicles to growing follicles, declined to a time point 5 yr prior to the FMP; this may represent a critical biological juncture in the menopause transition. 
Low and nondetectable levels inhibin B levels also were observed 4–5 yr prior to the FMP but were less predictive of time to FMP or age at FMP.}, number={9}, journal={JOURNAL OF CLINICAL ENDOCRINOLOGY \& METABOLISM}, author={Sowers, MaryFran R. and Eyvazzadeh, Aimee D. and McConnell, Daniel and Yosef, Matheos and Jannausch, Mary L. and Zhang, Daowen and Harlow, Sioban and Randolph, Jr., John F.}, year={2008}, month={Sep}, pages={3478–3483} } @article{quan_zhang_zhang_devlamynck_2007, title={Analysis of a binary composite endpoint with missing data in components}, volume={26}, ISSN={["0277-6715"]}, DOI={10.1002/sim.2893}, abstractNote={Composite endpoints are often used in clinical trials in order to increase the overall event rates, reduce the sizes of the trials and achieve desired power. For example, in a trial to study the effect of a treatment on the prevention of venous thromboembolic events after a major orthopaedic surgery of the lower limbs, the primary endpoint is usually a composite endpoint consisting of any deep vein thrombosis identified by systematic venography of lower limbs, symptomatic and well-documented non-fatal pulmonary embolism, and death from all causes. Just as any endpoints, missing data can occur in the components of the composite endpoint. If a patient has missing data on some of the components but not all the components, this patient may not have complete data but partial data for the composite endpoint. To be consistent with the intention-to-treat principle, the patient should not be discarded from the analysis. In this research, we propose an approach for the analysis of a composite endpoint with missing data in components. The main idea is to first derive the probabilities of all possible study outcomes based on the appropriate model and then to construct the overall rate for the composite endpoint. Simulations are conducted to compare the approach with several naïve methods. 
A data example is used to illustrate the application of the approach.}, number={26}, journal={STATISTICS IN MEDICINE}, author={Quan, Hui and Zhang, Daowen and Zhang, Ji and Devlamynck, Laure}, year={2007}, month={Nov}, pages={4703–4718} } @article{tzeng_zhang_2007, title={Haplotype-based association analysis via variance-components score test}, volume={81}, ISSN={["0002-9297"]}, DOI={10.1086/521558}, abstractNote={Haplotypes provide a more informative format of polymorphisms for genetic association analysis than do individual single-nucleotide polymorphisms. However, the practical efficacy of haplotype-based association analysis is challenged by a trade-off between the benefits of modeling abundant variation and the cost of the extra degrees of freedom. To reduce the degrees of freedom, several strategies have been considered in the literature. They include (1) clustering evolutionarily close haplotypes, (2) modeling the level of haplotype sharing, and (3) smoothing haplotype effects by introducing a correlation structure for haplotype effects and studying the variance components (VC) for association. Although the first two strategies enjoy a fair extent of power gain, empirical evidence showed that VC methods may exhibit only similar or less power than the standard haplotype regression method, even in cases of many haplotypes. In this study, we report possible reasons that cause the underpowered phenomenon and show how the power of the VC strategy can be improved. We construct a score test based on the restricted maximum likelihood or the marginal likelihood function of the VC and identify its nontypical limiting distribution. Through simulation, we demonstrate the validity of the test and investigate the power performance of the VC approach and that of the standard haplotype regression approach. With suitable choices for the correlation structure, the proposed method can be directly applied to unphased genotypic data. 
Our method is applicable to a wide-ranging class of models and is computationally efficient and easy to implement. The broad coverage and the fast and easy implementation of this method make the VC strategy an effective tool for haplotype analysis, even in modern genomewide association studies. Haplotypes provide a more informative format of polymorphisms for genetic association analysis than do individual single-nucleotide polymorphisms. However, the practical efficacy of haplotype-based association analysis is challenged by a trade-off between the benefits of modeling abundant variation and the cost of the extra degrees of freedom. To reduce the degrees of freedom, several strategies have been considered in the literature. They include (1) clustering evolutionarily close haplotypes, (2) modeling the level of haplotype sharing, and (3) smoothing haplotype effects by introducing a correlation structure for haplotype effects and studying the variance components (VC) for association. Although the first two strategies enjoy a fair extent of power gain, empirical evidence showed that VC methods may exhibit only similar or less power than the standard haplotype regression method, even in cases of many haplotypes. In this study, we report possible reasons that cause the underpowered phenomenon and show how the power of the VC strategy can be improved. We construct a score test based on the restricted maximum likelihood or the marginal likelihood function of the VC and identify its nontypical limiting distribution. Through simulation, we demonstrate the validity of the test and investigate the power performance of the VC approach and that of the standard haplotype regression approach. With suitable choices for the correlation structure, the proposed method can be directly applied to unphased genotypic data. Our method is applicable to a wide-ranging class of models and is computationally efficient and easy to implement. 
The broad coverage and the fast and easy implementation of this method make the VC strategy an effective tool for haplotype analysis, even in modern genomewide association studies. Haplotypes of multiple SNPs are considered a more informative format of polymorphisms for genetic association analysis than single SNPs.1The International HapMap Consortium The International HapMap Project.Nature. 2003; 426: 789-796Crossref PubMed Scopus (4688) Google Scholar Haplotypes are more informative because they preserve the joint linkage disequilibrium (LD) structure among multiple adjacent markers.2Akey J Jin L Xiong M Haplotypes vs single marker linkage disequilibrium tests: what do we gain?.Eur J Hum Genet. 2001; 9: 291-300Crossref PubMed Scopus (351) Google Scholar Even when only tag SNPs are used, haplotypes serve as a proxy for unobserved SNPs and increase the predictive power for the genomic variation.3Pe’er I de Bakker PI Maller J Yelensky R Altshuler D Daly MJ Evaluating and improving power in whole-genome association studies using fixed marker sets.Nat Genet. 2006; 38: 663-667Crossref PubMed Scopus (238) Google Scholar, 4Zaitlen N Kang HM Eskin E Halperin E Leveraging the HapMap correlation structure in association studies.Am J Hum Genet. 2007; 80: 683-691Abstract Full Text Full Text PDF PubMed Scopus (53) Google Scholar However, in terms of practical efficacy, the power of haplotype-based association analysis is challenged by a trade-off between the benefits of modeling abundant variation and the cost of the extra degrees of freedom for modeling the multimarker variations. To avoid the curse of dimensionality encountered in haplotype association analysis, various strategies have been proposed in the literature. They include (1) clustering evolutionarily close haplotypes,5Seltman H Roeder K Devlin B Evolutionary-based association analysis using haplotype data.Genet Epidemiol. 
2003; 25: 48-58Crossref PubMed Scopus (93) Google Scholar, 6Durrant C Zondervan KT Cardon LR Hunt S Deloukas P Morris AP Linkage disequilibrium mapping via cladistic analysis of single-nucleotide polymorphism haplotypes.Am J Hum Genet. 2004; 75: 35-43Abstract Full Text Full Text PDF PubMed Scopus (162) Google Scholar, 7Tzeng JY Evolutionary-based grouping of haplotypes in association analysis.Genet Epidemiol. 2005; 28: 220-231Crossref PubMed Scopus (35) Google Scholar, 8Tzeng JY Wang CH Kao JT Hsiao CK Regression-based association analysis with clustered haplotypes through use of genotypes.Am J Hum Genet. 2006; 78: 231-242Abstract Full Text Full Text PDF PubMed Scopus (69) Google Scholar (2) modeling the level of haplotype sharing instead of the haplotypes themselves,9McPeek MS Strahs A Assessment of linkage disequilibrium by the decay of haplotype sharing, with application to fine-scale genetic mapping.Am J Hum Genet. 1999; 65: 858-875Abstract Full Text Full Text PDF PubMed Scopus (177) Google Scholar, 10der Meulen MAV te Meerman GJ Haplotype sharing analysis in affected individuals from nuclear families with at least one affected offspring.Genet Epidemiol. 1997; 14: 915-920Crossref PubMed Scopus (44) Google Scholar, 11Tzeng JY Devlin B Wasserman L Roeder K On the identification of disease mutations by the analysis of haplotype similarity and goodness of fit.Am J Hum Genet. 2003; 72: 891-902Abstract Full Text Full Text PDF PubMed Scopus (111) Google Scholar and (3) smoothing haplotype effects by introducing a correlation structure for the effects of similar haplotypes.12Thomas DC Morrison JL Clayton DG Bayes estimates of haplotype effects.Genet Epidemiol. 2001; 21: S712-S717PubMed Google Scholar, 13Molitor J Marjoram P Thomas D Application of Bayesian spatial statistical methods to analysis of haplotypes effects and gene mapping.Genet Epidemiol. 
2003; 25: 95-105Crossref PubMed Scopus (33) Google Scholar, 14Schaid DJ Evaluating associations of haplotypes with traits.Genet Epidemiol. 2004; 27: 348-364Crossref PubMed Scopus (251) Google Scholar Although these strategies appear to be different, the fundamental principle is to use the evolutionary history of haplotypes to reduce the parameter space from individual haplotypes to haplotypes with similar ancestry. However, although the approaches of haplotype clustering and haplotype sharing enjoy a fair amount of power gain, empirical studies found that the smoothing approach may exhibit only similar or less power than the standard methods that regress trait values on haplotypes and impose no assumptions on haplotypes, even when there are many haplotypes.14Schaid DJ Evaluating associations of haplotypes with traits.Genet Epidemiol. 2004; 27: 348-364Crossref PubMed Scopus (251) Google Scholar In haplotype smoothing, a dependence structure is introduced to the effects of different haplotypes, according to the similarity between haplotypes, under a Bayesian hierarchical model or a mixed-model framework, and the overall gene-trait association can be studied via the variance components (VC).12Thomas DC Morrison JL Clayton DG Bayes estimates of haplotype effects.Genet Epidemiol. 2001; 21: S712-S717PubMed Google Scholar, 13Molitor J Marjoram P Thomas D Application of Bayesian spatial statistical methods to analysis of haplotypes effects and gene mapping.Genet Epidemiol. 2003; 25: 95-105Crossref PubMed Scopus (33) Google Scholar, 14Schaid DJ Evaluating associations of haplotypes with traits.Genet Epidemiol. 2004; 27: 348-364Crossref PubMed Scopus (251) Google Scholar The idea of correlating haplotype effect is based on the assumption that the present mutation-bearing haplotypes have descended from a small number of ancestral haplotypes, and, as a result, the disease haplotypes tend to be correlated because of this shared ancestry. 
Without losing generality, in this work, we refer to these methods as “VC” approaches and discuss them under a mixed-model framework. We also refer to the standard haplotype regression method as a “fixed-effect” approach. Schaid14Schaid DJ Evaluating associations of haplotypes with traits.Genet Epidemiol. 2004; 27: 348-364Crossref PubMed Scopus (251) Google Scholar first noted the underpowered phenomenon of the VC method, using the likelihood-ratio test (LRT), and explored potential reasons based on the noncentrality (NC) parameter of the distribution of the LRT statistics. The NC parameter reflects the distance between the alternative distribution and the null distribution of the test statistics, and the larger the null-to-alternative distance is, the higher the power a test possesses. By expressing the NC parameter as a function of heritability (h2), it can be seen that, although the NC parameter of a fixed-effect model is proportional to h2/1-h2, the NC parameter of a VC model is much smaller (proportional to h4). As a result, the power gain brought by the low degrees of freedom can be compromised with the small NC parameter in a VC-LRT approach. Here, we report other key factors that contribute to this underpowered phenomenon. In brief, unlike the usual VC model in which the VC represents the potential variability from a source that is independently distributed in the population (e.g., the family effect in the study of linkage or familial aggregation), in the population-based haplotype analysis, the source of variability is not independent. That is, the design matrix of the random haplotype effect does not have a diagonal or block-diagonal structure. Furthermore, the dimension of the random haplotype effect is fixed. Therefore, the data under the alternative hypothesis cannot be represented as a collection of independent data vectors. 
As a result, the distribution of the LRT statistic does not converge to the conventional 50:50 mixture of χ20 and χ21 (i.e., the limiting distribution predicted by the usual asymptotic theory15Self SG Liang KY Asymptotic properties of maximum likelihood estimators and likelihood ratio tests under nonstandard conditions.J Am Stat Assoc. 1987; 82: 605-610Crossref Scopus (1780) Google Scholar). Instead, empirical evidence indicates that the distribution of VC-LRT statistics has higher weighting of χ20. Hence, the threshold value obtained from the 50:50 χ2 mixture is overstringent and causes a too-conservative testing result. Such overconservative findings of the LRT was obtained also by Crainiceanu and Ruppert16Crainiceanu CM Ruppert D Likelihood ratio tests in linear mixed models with one variance component.J R Statist Soc B. 2004; 66: 165-185Crossref Scopus (265) Google Scholar in certain linear mixed models. To overcome the problem of a lack of independence and also to generalize the VC approach to all types of trait values, we propose a score test under the generalized linear mixed-model (GLMM) framework. Specifically, we construct a score statistic based on the restricted maximum likelihood (REML) or the marginal likelihood function of the VC and identify its nontypical asymptotic distribution. The proposed test is easy to implement and computationally efficient yet is general enough to accommodate a broad class of phenotypes and correlation structures. It allows for covariate information and can be used for phase-unknown genotypic data. Through simulation, we demonstrate the validity of the test and investigate the power performance of the VC approach and the fixed-effect approach under general scenarios. 
We also apply the proposed method to a case-control data set from a genomewide association study of amyotrophic lateral sclerosis (ALS) conducted by Schymick et al.17Schymick JC Scholz SW Fung HC Britton A Arepalli S Gibbs JR Lombardo F Matarin M Kasperaviciute D Hernandez DG et al.Genome-wide genotyping in amyotrophic lateral sclerosis and neurologically normal controls: first stage analysis and public release of data.Lancet Neurol. 2007; 6: 322-328Abstract Full Text Full Text PDF PubMed Scopus (179) Google Scholar In the analysis, we test for gene-trait association on chromosome 10 with the 275 ALS cases and 271 controls and examine statistical significance at the genomewide level. We verify the findings from the proposed method by comparing them with the results reported by Schymick et al.17Schymick JC Scholz SW Fung HC Britton A Arepalli S Gibbs JR Lombardo F Matarin M Kasperaviciute D Hernandez DG et al.Genome-wide genotyping in amyotrophic lateral sclerosis and neurologically normal controls: first stage analysis and public release of data.Lancet Neurol. 2007; 6: 322-328Abstract Full Text Full Text PDF PubMed Scopus (179) Google Scholar We denote the data with the following notations. For individual i (i=1,2,…,n), we have trait value Yi, environmental covariates Xi (a K×1 vector including the intercept term), and haplotype Hi (an L×1 vector, where L is the number of distinct haplotypes observed in the population). Vector Hi records individual i’s haplotype pair via a certain scoring rule, such as by setting its hth element as the number of haplotype h that individual i carries. Throughout this article, we treat explanatory variables (e.g., Xi and Hi) as constants and will omit them in the lists of the conditional variables. This means that, for example, we will use Var (Yi) instead of Var(Yi|Xi,Hi). 
Assume that the trait value Yi follows some distribution with conditional mean E(Yi|β)=βi and conditional variance Var Yi|β=m−1iϕv βi, where mi is a known prior weight (e.g., binomial denominator), ϕ is the dispersion parameter (e.g., measurement-error variance for a normal quantitative trait), and v βi is the variance function. Then, the VC model can be expressed under the framework of GLMM asg(μi)=XiTγ+HiTββ~MN(0,τRβ) ,(1) where g(·) is a link function that connects the conditional mean βi and the explanatory variables, γK×1 represents the fixed effect of environmental covariates, and βL×1 is the random effect of haplotypes. The haplotype effect is assumed to have a multivariate normal (MN) prior. With model (1), the marginal phenotypic variance, Var (Yi), can be partitioned into genetic components and environment components, and the association between haplotypes and traits can be detected by testing for zero genetic VC (i.e., τ=0). Intuitively, τ=0 implies that all βh share the same value, and this is essentially the null hypothesis of the standard fixed-effect approaches. The correlation structure of βh is specified through the L×L matrix Rβ. Here, we consider a general formulation for Rβ by letting its (h,k) element, denoted by rhk, depend on the similarity level between haplotypes h and k, which is quantified by a certain similarity metric, s(h,k). One simple choice of the correlation structure is to let Rβ=I, where I is the identity matrix. This independence structure imposes no correlation among distinct haplotypes and reflects the “unstructured” variation among haplotypes. The independence prior may be reasonable if haplotype variants were created mainly by recombinations instead of mutations. In contrast, one can introduce local-dependence structures to account for the role of mutation and to reflect the conjecture that evolutionarily close haplotypes tend to have similar effects on traits. 
One convenient choice of such Rβ is the conditional autoregressive (CAR) structure. The CAR structure assumes that all βh are correlated but that the correlation diminishes as the haplotype similarity decays. With our representation, a CAR structure is to let Rβ=C, where C−1 has diagonal elements equal to 1 and off-diagonal elements equal to -s(h,k).18Carlin B Louis T Bayes and empirical Bayes methods for data analysis. 2nd ed. Chapman & Hall, New York2000: 262Google Scholar Alternatively, to avoid choosing between an independence prior and a sole CAR prior, an intermediate option, in practice, is the convolution model that combines the two: τRβ=τ1I+τ2C.12Thomas DC Morrison JL Clayton DG Bayes estimates of haplotype effects.Genet Epidemiol. 2001; 21: S712-S717PubMed Google Scholar, 13Molitor J Marjoram P Thomas D Application of Bayesian spatial statistical methods to analysis of haplotypes effects and gene mapping.Genet Epidemiol. 2003; 25: 95-105Crossref PubMed Scopus (33) Google Scholar In this work, we focus on the model that was considered by Schaid14Schaid DJ Evaluating associations of haplotypes with traits.Genet Epidemiol. 2004; 27: 348-364Crossref PubMed Scopus (251) Google Scholar and set rhk=s(h,k), with 0≤s(h,k)≤1. This model uses the haplotype similarity to reflect the correlation directly. It is more extreme but uses a simpler concept than the convolution model, by compromising between the dependence and the independence priors. It allows for correlation induced from partially similar haplotypes but assumes independence among haplotypes that share zero similarity. To motivate our VC-score test for haplotype-phenotype association, we illustrate the method, assuming a normally distributed trait (perhaps after some transformation, such as the logarithm transformation) with a known dispersion parameter, ϕ. We then present the VC-score test for general scenarios of unknown ϕ and trait values with an arbitrary distribution. 
We provide the derivation of the generalization in appendixes A and B. For quantitative traits that follow a normal distribution directly or after appropriate transformations, model (1) reduces to a linear mixed-model in matrix notation:Y=Xγ+Hβ+ε ,(2) where X is the design matrix for γ, whose ith row is XTi; H is the design matrix for β, whose ith row is HTi; β∼MN(0,τRβ) is the same as described in model (1); and ε∼N(0,ϕI) represents the uncertainty in measuring traits Y. Since our primary interest is to test H0:τ=0, we consider the REML log-likelihood function of VC (τ,ϕ). It is well known that the REML estimating equation for (τ,ϕ) is unbiased and will produce less biased estimates compared with the maximum-likelihood approach.19Searle S Casella G McCulloch C Variance components. Wiley, New York1992: 232-257Crossref Google Scholar Denote by ℓREML(τ,ϕ;Y) the REML log-likelihood function of τ and ϕ, which is given byℓREML(τ,φ;Y)=-12log|V|-12log|XTV-1X|-12YTPY ,(3) where V=τHRβHT+ϕI≡τS+ϕI is the marginal variance of Y and where P=V−1-V−1X(XTV−1X)−1XTV−1 is the projection matrix for the linear mixed model (2). The REML log-likelihood function (3) can also be viewed as the marginal log-likelihood of (τ,ϕ) from the Bayesian perspective obtained by specifying a flat prior for γ and integrating out γ from f(Y;γ,τ,ϕ). Simple algebra20Harville D Maximum likelihood approaches to variance component estimation and related problems.J Am Stat Assoc. 1977; 72: 322-340Google Scholar shows that the score statistic of τ evaluated under H0 on the basis of the REML function (3) is equal toUτ=∂lREML(τ,ϕ)∂τ|τ=0=12{YTP0SP0Y-tr(P0S)},(4) where P0=ϕ−1{I-X(XTX)−1XT}=ϕ−1Q is the projection matrix P evaluated under H0:τ=0 and where Q=I-X(XTX)−1XT. It is immediately seen from equation (4) that E(Uτ)=0 under H0:τ=0, and, when τ>0, E(Uτ)=τ·tr(QSQS)/(2ϕ2), which is a strictly increasing function of τ unless QS=0. Therefore, larger values of Uτ provide stronger evidence against H0. 
This suggests that the testing procedure for H0:τ=0 using Uτ should be one sided. In a situation where the VC τ represents the potential variability due to a source that is independently distributed in the population such as the subject-specific effects in a longitudinal study, the score statistic Uτ given in equation (4) under H0:τ=0 has an asymptotic normal distribution with zero mean and some variance when the number of independent clusters goes to infinity.21Lin X Variance component testing in generalized linear models with random effects.Biometrika. 1997; 84: 309-326Crossref Scopus (206) Google Scholar However, this condition does not satisfy in our case. In model (1), the design matrix H for the random effects β is not block diagonal and the dimension of β is fixed. Hence, the Lin’s21Lin X Variance component testing in generalized linear models with random effects.Biometrika. 1997; 84: 309-326Crossref Scopus (206) Google Scholar asymptotic result does not directly apply to Uτ. Since ϕ is known, the second term in Uτ is a constant. Therefore using the score statistic Uτ is equivalent to using the first term of Uτ (denoted by Tτ):Tτ=12YTP0SP0Y=12φ2YTQSQY .(5) We show in appendix A that Tτ has the same distribution as the weighted χ2 random variables Σci=1 λiχ21,i, where χ21,i’s are independent χ2 random variables with 1 df, and λi is the ordered non-zero eigenvalues of the semipositive definite matrix QSQ/(2ϕ) with λ1⩾λ2⩾·⩾λc>0 (c≤L). If the (1-α)th quantile of this weighted χ2 distribution is denoted by T(α), then a level α score test will reject H0 if Tτ⩾T(α). Here, we present the VC-score test for the general case in which the traits may not be normally distributed and the dispersion parameter ϕ may or may not be known. 
As indicated by the derivation given in appendix B, our test statistic can be defined asTτ=12(Y-μ)TΔWSWΔ(Y-μ)|τ=0,ϕ=ϕˆ,γ=γˆ,(6) where μ=g−1(Xγ), Δ=diag{g′(βi)},γ^ is the maximum-likelihood estimate of γ under H0, andϕ^ is the REML type of estimate (such as the one that uses Pearson residuals) of ϕ under H0. Matrix W=diag{wi}, with wi={ϕm−1iv(βi)[g′(βi)]2}−1. These quantities are readily available by fitting a standard generalized linear model, g(β)=Xγ. We derive in appendix B that Tτ also follows approximately the weighted χ2 distribution Σci=1λiχ21,i, where λ1⩾λ2⩾·⩾λc>0 (c≤L) is the nonzero eigenvalues of matrix W−1/2P0SP0W−1/2/2. We note that the conclusions given in the previous section are a special case of the results given here. For normally distributed traits, Δ=I, and W=V−1, which equals ϕ−1I under H0. Hence, equation (6) reduces to equation (5), and the matrix W−1/2P0SP0W−1/2/2 reduces to QSQ/(2ϕ). Given the fact that Tτ follows a weighted χ2 distribution, one can obtain the significance threshold Tα at level α from simulation. However, such a task may not be trivial when α is small. As an alternative, we introduce a Gamma approximation of the distribution of Tτ. Empirical evidence indicates that the eigenvalues λ1,λ2,·,λc of the matrix W−1/2P0SP0W−1/2/2 are dominated by the first few ones and decay rapidly to 0 (fig. 1). Following the work of Zhang and Lin,22Zhang D Lin X Hypothesis testing in semiparametric additive mixed models.Biostatistics. 2003; 4: 57-74Crossref PubMed Scopus (94) Google Scholar we use the Satterthwaite method to approximate the null distribution of Tτ by a Gamma distribution with parameters (a,b). Let E and V denote the mean and variance of Tτ, respectively. We match the mean and the variance of the Gamma distribution and those of the test statistic by setting ab=E and ab2=V, and we get a=E2/V and b=V/E. We can then obtain T(α) or calculate the P value of the test statistic from the distribution of Gamma (a,b). 
The mean, E, and variance, V, of Tτ can be calculated (appendixes A and B) byE∧=12tr(P0S)andV∧=I∧ττ-I∧τφ2/I∧φφ ,whereVˆ=Iˆττ-Iˆτϕ2/Iˆϕϕ,andI∧φφ=12φ∧2tr(P0W-1)=n-K2φ∧2 . Although we have described our test, assuming that the haplotype information H is observed, the phase information can be not crucial. From equations (5) and (6), we see that the haplotype information appears in Tτ only through S=HRβHT, whose (i,j) element, denoted by Sij, can be rewritten asSij=HiTRβHj=∑h,kHi,hHj,k×s(h,k) .The right-hand side of the equation states that Sij is simply the similarity score between the haplotype pair of person i and that of person j measured by metric s(h,k). As a result, by choosing those metrics that do not require phase information, we can calculate S without resorting to the phased data. One choice is to set s(h,k) as the proportion of matching alleles between two haplotypes, h and k. As demonstrated by Tzeng et al.11Tzeng JY Devlin B Wasserman L Roeder K On the identification of disease mutations by the analysis of haplotype similarity and goodness of fit.Am J Hum Genet. 2003; 72: 891-902Abstract Full Text Full Text PDF PubMed Scopus (111) Google Scholar and Schaid,14Schaid DJ Evaluating associations of haplotypes with traits.Genet Epidemiol. 2004; 27: 348-364Crossref PubMed Scopus (251) Google Scholar such Sij is equivalent to the proportion of matching alleles between the genotypes of individual i and individual j and hence can be calculated directly from genotypes with unknown phase. We conduct simulation studies to examine the performance of the proposed score test. In the simulation, we generated covariates Xi, haplotypes Hi, and trait values Yi, given Xi and Hi, for each individual. 
The covariate Xi is drawn from a standard normal distribution, and the haplotype Hi is generated using a technique similar to those reported by Roeder et al.23Roeder K Bacanu SA Sonpar V Zhang X Devlin B Analysis of single-locus tests to detect gene-disease associations.Genet Epidemiol. 2005; 28: 207-219Crossref PubMed Scopus (81) Google Scholar and Tzeng et al.8Tzeng JY Wang CH Kao JT Hsiao CK Regression-based association analysis with clustered haplotypes through use of genotypes.Am J Hum Genet. 2006; 78: 231-242Abstract Full Text Full Text PDF PubMed Scopus (69) Google Scholar Specifically, we simulated 100 haplotypes under the coalescent model,24Wall JD Pritchard JK Assessing the performance of the haplotype block model of linkage disequilibrium.Am J Hum Genet. 2003; 73: 502-515Abstract Full Text Full Text PDF PubMed Scopus (122) Google Scholar with an effective population size of 104, a scaled mutation rate of 5.6×10−4 per bp, and a scaled recombination rate of ∼6×10−3 per bp for the cold spots and a rate 45 times greater for the hotspots. These parameters are chosen to roughly match the genes observed in the SeattleSNP database. We discarded SNPs with minor-allele frequencies <0.05. The hypothetical disease locus is selected on the basis of a predetermined minor-allele frequency, q, and the diversity of haplotypes flanking the SNP. In the simulation, we considered q=0.1, 0.3, and 0.5 and haplotype-diversity levels of high (11–16 distinct haplotypes), moderate (9–11 distinct haplotypes), and low (6–9 distinct haplotypes). We set a haplotype region to be a segment of five adjacent SNPs, including the two SNPs on the left and the three SNPs on the right of the disease locus. Given that the disease SNP is excluded, we also considered whether the disease SNP is “tagged” or “not tagged” by the surrounding five SNPs under each scenario. 
We defined that the disease SNP is “tagged” if there is at least one SNP whose R2 with the disease SNP is >0.7, and it is “not tagged” otherwise. We then randomly sampled with replacement of 2 haplotypes from the 100 haplotypes to form an individual. The simulated haplotype data were then converted into unphased genotype data. We next generated the trait values Yi on the basis of Xi and the genotypes at the disease locus. We determined the trait value of individual i according to Xi and the number of disease alleles (Gi), using an additive-effect model. In the simulation study, we considered both quantitative traits and binary traits and adopted the same trait-generating scheme as did Lake et al.25Lake SL Lyon H Tantisira K Silverman EK Weiss ST Laird NM Schaid DJ Estimation and tests of haplotype-environment interaction when linkage phase is ambiguous.Hum Hered. 2003; 55: 56-65Crossref PubMed Scopus (385) Google Scholar and Tzeng et al.8Tzeng JY Wang CH Kao JT Hsiao CK Regression-based association analysis with clustered haplotypes through use of genotypes.Am J Hum Genet. 2006; 78: 231-242Abstract Full Text Full Text PDF PubMed Scopus (69) Google Scholar For quantitative traits, we used a random-sampling scheme and generated 200 trait values from the normal conditional distribution of Yi with mean γ0+γ1×Xi+(Gi-1) and variance 2q(1-q)×(1-h2)/h2. We set the heritability (h2) at 0.1 and γ0=γ1=1. For binary traits, we used a case-control sampling scheme and generated trait values of 0 or 1, using the penetrance function logitP(Y=1|Gi,Xi)=γ0+γ1×Xi+θ×Gi. We set the odds ratio (OR) (eθ) at 2.0 and set the disease prevalence at 0.01 by letting γ0=-4.5 and γ1=0. We repeated the process until we collected 100 cases and 100 controls. We analyzed these simulated data to evaluate the power performance of the VC-score method. 
To compare, we also conducted haplotype analyses, using the fixed-effect method and, in addition, the VC method via regular LRT (VC-LRT) under some scenarios. These analyses were performed assuming unknown phases. For fixed-effect analysis, we use}, number={5}, journal={AMERICAN JOURNAL OF HUMAN GENETICS}, author={Tzeng, Jung-Ying and Zhang, Daowen}, year={2007}, month={Nov}, pages={927–938} }

@article{li_zhang_davidian_2007,
  title = {Likelihood and pseudo-likelihood methods for semiparametric joint models for a primary endpoint and longitudinal data},
  volume = {51},
  ISSN = {0167-9473},
  url = {http://dx.doi.org/10.1016/j.csda.2006.10.008},
  DOI = {10.1016/j.csda.2006.10.008},
  abstractNote = {Inference on the association between a primary endpoint and features of longitudinal profiles of a continuous response is of central interest in medical and public health research. Joint models that represent the association through shared dependence of the primary and longitudinal data on random effects are increasingly popular; however, existing inferential methods may be inefficient or sensitive to assumptions on the random effects distribution. We consider a semiparametric joint model that makes only mild assumptions on this distribution and develop likelihood-based inference on the association and distribution, which offers improved performance relative to existing methods that is insensitive to the true random effects distribution. 
Moreover, the estimated distribution can reveal interesting population features, as we demonstrate for a study of the association between longitudinal hormone levels and bone status in peri-menopausal women.},
  number = {12},
  journal = {Computational Statistics \& Data Analysis},
  publisher = {Elsevier BV},
  author = {Li, Erning and Zhang, Daowen and Davidian, Marie},
  year = {2007},
  month = aug,
  pages = {5776--5790}
}

@article{zhang_lin_sowers_2007,
  title = {Two-stage functional mixed models for evaluating the effect of longitudinal covariate profiles on a scalar outcome},
  volume = {63},
  ISSN = {1541-0420},
  DOI = {10.1111/j.1541-0420.2006.00713.x},
  abstractNote = {Summary The Daily Hormone Study, a substudy of the Study of Women's Health Across the Nation (SWAN) consisting of more than 600 pre- and perimenopausal women, includes a scalar measure of total hip bone mineral density (BMD) together with repeated measures of creatinine-adjusted follicle stimulating hormone (FSH) assayed from daily urine samples collected over one menstrual cycle. It is of scientific interest to investigate the effect of the FSH time profile during a menstrual cycle on total hip BMD, adjusting for age and body mass index. The statistical analysis is challenged by several features of the data: (1) the covariate FSH is measured longitudinally and its effect on the scalar outcome BMD may be complex; (2) due to varying menstrual cycle lengths, subjects have unbalanced longitudinal measures of FSH; and (3) the longitudinal measures of FSH are subject to considerable among- and within-subject variations and measurement errors. We propose a measurement error partial functional linear model, where repeated measures of FSH are modeled using a functional mixed effects model and the effect of the FSH time profile on BMD is modeled using a partial functional linear model by treating the unobserved true subject-specific FSH time profile as a functional covariate. 
We develop a two-stage nonparametric regression calibration method using period smoothing splines. Using the connection between smoothing splines and mixed models, we show that a key feature of our approach is that estimation at both stages can be conveniently cast into a unified mixed model framework. A simple testing procedure for constant functional covariate effect is also proposed. The proposed methods are evaluated using simulation studies and applied to the SWAN data.},
  number = {2},
  journal = {Biometrics},
  author = {Zhang, Daowen and Lin, Xihong and Sowers, MaryFran},
  year = {2007},
  month = jun,
  pages = {351--362}
}

@article{lin_zhang_davidian_2006,
  title = {Smoothing spline-based score tests for proportional hazards models},
  volume = {62},
  ISSN = {0006-341X},
  DOI = {10.1111/j.1541-0420.2005.00521.x},
  abstractNote = {Summary We propose “score-type” tests for the proportional hazards assumption and for covariate effects in the Cox model using the natural smoothing spline representation of the corresponding nonparametric functions of time or covariate. The tests are based on the penalized partial likelihood and are derived by viewing the inverse of the smoothing parameter as a variance component and testing an equivalent null hypothesis that the variance component is zero. We show that the tests have a size close to the nominal level and good power against general alternatives, and we apply them to data from a cancer clinical trial.},
  number = {3},
  journal = {Biometrics},
  author = {Lin, Jiang and Zhang, Daowen and Davidian, Marie},
  year = {2006},
  month = sep,
  pages = {803--812}
}

@article{sowers_crutchfield_richards_wilkin_furniss_jannausch_zhang_gross_2005,
  title = {Sarcopenia is related to physical functioning and leg strength in middle-aged women},
  volume = {60},
  ISSN = {1758-535X},
  DOI = {10.1093/gerona/60.4.486},
  abstractNote = {Background. 
In the aging process, loss of muscle is relatively continuous, but the initiation, timing, and amount of muscle loss that relate to functional compromise are poorly described. Also poorly understood is whether strength and functioning in aging are related to the amount of lean mass and its change as well as to the amount of fat mass and its change. Methods. The purpose of the study was to ascertain whether 3-year lean and fat mass change predicted functional status in 712 African American and Caucasian women, aged 34–58 years. Fat and lean mass were assessed with bioelectrical impedance. Lower leg strength (torque) was measured with a portable isometric chair, and two indices of physical functioning, walking velocity and double support (both feet touching the surface while walking), were measured with an instrumented gait mat. Results. Almost 9% of middle-aged women had at least a 6% loss (>2.5 kg) of lean mass over the 3-year observation period. Women who lost at least 2.5 kg of lean mass had slower walking velocity and less leg strength, although women who simultaneously gained more than 2.5 kg of fat mass (at least 7.5%) did not have less leg strength. Age was significantly associated with less velocity, less leg strength, and more time in double support. Conclusions. 
Even in middle-aged women, there is loss of lean mass among almost 1 woman in 10, and this loss of lean mass (sarcopenia) is associated with greater compromise in physical functioning.},
  number = {4},
  journal = {Journals of Gerontology Series A-Biological Sciences and Medical Sciences},
  author = {Sowers, M. R. and Crutchfield, M. and Richards, K. and Wilkin, M. K. and Furniss, A. and Jannausch, M. and Zhang, D. W. and Gross, M.},
  year = {2005},
  month = apr,
  pages = {486--490}
}

@comment{NOTE(review): li_zhang_davidian_2004 originally had the malformed value pages=07- (open range, leading zero).
  It is restored here as 1--7, the presumed range in Biometrics 60(1); confirm against the publisher record.}
@article{li_zhang_davidian_2004,
  title = {Conditional estimation for generalized linear models when covariates are subject-specific parameters in a mixed model for longitudinal measurements},
  volume = {60},
  number = {1},
  journal = {Biometrics},
  author = {Li, E. N. and Zhang, D. W. and Davidian, M.},
  year = {2004},
  pages = {1--7}
}

@comment{The key of the next entry originally ended in "kianian_et al._2004". A citation key may not contain
  whitespace, so the entry could not be parsed or cited; the key now ends in "kianian_et_al_2004".}
@article{zhang_choi_wanamaker_fenton_chin_malatrasi_turuspekov_walia_akhunov_kianian_et_al_2004,
  title = {Construction and evaluation of {cDNA} libraries for large-scale expressed sequence tag sequencing in wheat ({Triticum} aestivum {L.})},
  volume = {168},
  ISSN = {1943-2631},
  DOI = {10.1534/genetics.104.034785},
  abstractNote = {Abstract A total of 37 original cDNA libraries and 9 derivative libraries enriched for rare sequences were produced from Chinese Spring wheat (Triticum aestivum L.), five other hexaploid wheat genotypes (Cheyenne, Brevor, TAM W101, BH1146, Butte 86), tetraploid durum wheat (T. turgidum L.), diploid wheat (T. monococcum L.), and two other diploid members of the grass tribe Triticeae (Aegilops speltoides Tausch and Secale cereale L.). The emphasis in the choice of plant materials for library construction was reproductive development subjected to environmental factors that ultimately affect grain quality and yield, but roots and other tissues were also included. Partial cDNA expressed sequence tags (ESTs) were examined by various measures to assess the quality of these libraries. All ESTs were processed to remove cloning system sequences and contaminants and then assembled using CAP3. 
Following these processing steps, this assembly yielded 101,107 sequences derived from 89,043 clones, which defined 16,740 contigs and 33,213 singletons, a total of 49,953 “unigenes.” Analysis of the distribution of these unigenes among the libraries led to the conclusion that the enrichment methods were effective in reducing the most abundant unigenes and to the observation that the most diverse libraries were from tissues exposed to environmental stresses including heat, drought, salinity, or low temperature.},
  number = {2},
  journal = {Genetics},
  author = {Zhang, D. and Choi, D. W. and Wanamaker, S. and Fenton, R. D. and Chin, A. and Malatrasi, M. and Turuspekov, Y. and Walia, H. and Akhunov, E. D. and Kianian, P. and others},
  year = {2004},
  month = oct,
  pages = {595--608}
}

@comment{NOTE(review): zhang_2004 originally had the malformed value pages=15- (open range).
  It is restored here as 8--15, the presumed range in Biometrics 60(1); confirm against the publisher record.}
@article{zhang_2004,
  title = {Generalized linear mixed models with varying coefficients for longitudinal data},
  volume = {60},
  number = {1},
  journal = {Biometrics},
  author = {Zhang, D. W.},
  year = {2004},
  pages = {8--15}
}

@article{zhang_davidian_2004,
  title = {Likelihood and conditional likelihood inference for generalized additive mixed models for clustered data},
  volume = {91},
  ISSN = {0047-259X},
  DOI = {10.1016/j.jmva.2004.04.007},
  abstractNote = {Lin and Zhang (J. Roy. Statist. Soc. Ser. B 61 (1999) 381) proposed the generalized additive mixed model (GAMM) as a framework for analysis of correlated data, where normally distributed random effects are used to account for correlation in the data, and proposed to use double penalized quasi-likelihood (DPQL) to estimate the nonparametric functions in the model and marginal likelihood to estimate the smoothing parameters and variance components simultaneously. However, the normal distributional assumption for the random effects may not be realistic in many applications, and it is unclear how violation of this assumption affects ensuing inferences for GAMMs. 
For a particular class of GAMMs, we propose a conditional estimation procedure built on a conditional likelihood for the response given a sufficient statistic for the random effect, treating the random effect as a nuisance parameter, which thus should be robust to its distribution. In extensive simulation studies, we assess performance of this estimator under a range of conditions and use it as a basis for comparison to DPQL to evaluate the impact of violation of the normality assumption. The procedure is illustrated with application to data from the Multicenter AIDS Cohort Study (MACS).},
  number = {1},
  journal = {Journal of Multivariate Analysis},
  author = {Zhang, D. W. and Davidian, M.},
  year = {2004},
  month = oct,
  pages = {90--106}
}

@article{zhang_lin_2003,
  title = {Hypothesis testing in semiparametric additive mixed models},
  volume = {4},
  number = {1},
  journal = {Biostatistics (Oxford, England)},
  author = {Zhang, D. W. and Lin, X. H.},
  year = {2003},
  pages = {57--74}
}

@article{chen_zhang_davidian_2002,
  title = {A {Monte Carlo} {EM} algorithm for generalized linear mixed models with flexible random effects distribution},
  volume = {3},
  number = {3},
  journal = {Biostatistics (Oxford, England)},
  author = {Chen, J. L. and Zhang, D. W. and Davidian, M.},
  year = {2002},
  pages = {347--360}
}

@comment{The next entry originally used its whole title (with spaces and commas) as the citation key and had
  no title field, so it could not be parsed or cited. The title text is moved into a title field and a
  short valid key is assigned.
  NOTE(review): the export carries no author field for this record; add the author list from the
  publisher record. The capitalisation of the trial acronym COMPARE is presumed; confirm.}
@article{randomized_comparison_compare_trial_2002,
  title = {Randomized comparison of platelet inhibition with abciximab, tirofiban and eptifibatide during percutaneous coronary intervention in acute coronary syndromes - the {COMPARE} trial},
  volume = {106},
  number = {12},
  journal = {Circulation (New York, N.Y. : 1950)},
  year = {2002},
  pages = {1470--1476}
}

@article{zhang_davidian_2001,
  title = {Linear mixed models with flexible distributions of random effects for longitudinal data},
  volume = {57},
  ISSN = {0006-341X},
  DOI = {10.1111/j.0006-341X.2001.00795.x},
  abstractNote = {Normality of random effects is a routine assumption for the linear mixed model, but it may be unrealistic, obscuring important features of among-individual variation. 
We relax this assumption by approximating the random effects density by the seminonparameteric (SNP) representation of Gallant and Nychka (1987, Econometrics 55, 363-390), which includes normality as a special case and provides flexibility in capturing a broad range of nonnormal behavior, controlled by a user-chosen tuning parameter. An advantage is that the marginal likelihood may be expressed in closed form, so inference may be carried out using standard optimization techniques. We demonstrate that standard information criteria may be used to choose the tuning parameter and detect departures from normality, and we illustrate the approach via simulation and using longitudinal data from the Framingham study.},
  number = {3},
  journal = {Biometrics},
  author = {Zhang, D. W. and Davidian, M.},
  year = {2001},
  month = sep,
  pages = {795--802}
}

@article{lin_zhang_2001,
  title = {Semiparametric nonlinear mixed-effects models and their applications - Comment},
  volume = {96},
  number = {456},
  journal = {Journal of the American Statistical Association},
  author = {Lin, X. H. and Zhang, D. W.},
  year = {2001},
  pages = {1288--1291}
}

@article{lin_ryan_sammel_zhang_padungtod_xu_2000,
  title = {A scaled linear mixed model for multiple outcomes},
  volume = {56},
  DOI = {10.1111/j.0006-341X.2000.00593.x},
  abstractNote = {We propose a scaled linear mixed model to assess the effects of exposure and other covariates on multiple continuous outcomes. The most general form of the model allows a different exposure effect for each outcome. An important special case is a model that represents the exposure effects using a common global measure that can be characterized in terms of effect sizes. Correlations among different outcomes within the same subject are accommodated using random effects. We develop two approaches to model fitting, including the maximum likelihood method and the working parameter method. 
A key feature of both methods is that they can be easily implemented by repeatedly calling software for fitting standard linear mixed models, e.g., SAS PROC MIXED. Compared to the maximum likelihood method, the working parameter method is easier to implement and yields fully efficient estimators of the parameters of interest. We illustrate the proposed methods by analyzing data from a study of the effects of occupational pesticide exposure on semen quality in a cohort of Chinese men.},
  number = {2},
  journal = {Biometrics},
  author = {Lin, X. H. and Ryan, L. and Sammel, M. and Zhang, D. W. and Padungtod, C. and Xu, X. P.},
  year = {2000},
  pages = {593--601}
}

@article{zhang_lin_sowers_2000,
  title = {Semiparametric regression for periodic longitudinal hormone data from multiple menstrual cycles},
  volume = {56},
  ISSN = {0006-341X},
  DOI = {10.1111/j.0006-341X.2000.00031.x},
  abstractNote = {We consider semiparametric regression for periodic longitudinal data. Parametric fixed effects are used to model the covariate effects and a periodic nonparametric smooth function is used to model the time effect. The within-subject correlation is modeled using subject-specific random effects and a random stochastic process with a periodic variance function. We use maximum penalized likelihood to estimate the regression coefficients and the periodic nonparametric time function, whose estimator is shown to be a periodic cubic smoothing spline. We use restricted maximum likelihood to simultaneously estimate the smoothing parameter and the variance components. We show that all model parameters can be easily obtained by fitting a linear mixed model. A common problem in the analysis of longitudinal data is to compare the time profiles of two groups, e.g., between treatment and placebo. We develop a scaled chi-squared test for the equality of two nonparametric time functions. 
The proposed model and the test are illustrated by analyzing hormone data collected during two consecutive menstrual cycles and their performance is evaluated through simulations.},
  number = {1},
  journal = {Biometrics},
  author = {Zhang, D. W. and Lin, X. H. and Sowers, M. F.},
  year = {2000},
  month = mar,
  pages = {31--39}
}

@comment{NOTE(review): lin_zhang_1999 originally had number=1999, which duplicates the year rather than giving
  an issue. It is set to 2 here, the presumed part of volume 61 containing pages 381--400; confirm against
  the publisher record.}
@article{lin_zhang_1999,
  title = {Inference in generalized additive mixed models by using smoothing splines},
  volume = {61},
  number = {2},
  journal = {Journal of the Royal Statistical Society. Series B, Methodological},
  author = {Lin, X. H. and Zhang, D. W.},
  year = {1999},
  pages = {381--400}
}