@article{clark_fu_liu_ho_wang_lee_chou_wang_tzeng_2023,
  title={The prediction of {Alzheimer's} disease through multi-trait genetic modeling},
  volume={15},
  ISSN={1663-4365},
  DOI={10.3389/fnagi.2023.1168638},
  abstractNote={To better capture the polygenic architecture of Alzheimer’s disease (AD), we developed a joint genetic score, MetaGRS. We incorporated genetic variants for AD and 24 other traits from two independent cohorts, NACC (n = 3,174, training set) and UPitt (n = 2,053, validation set). One standard deviation increase in the MetaGRS is associated with about 57% increase in the AD risk [hazard ratio (HR) = 1.577, p = 7.17E-56], showing little difference from the HR for AD GRS alone (HR = 1.579, p = 1.20E-56), suggesting similar utility of both models. We also conducted APOE-stratified analyses to assess the role of the e4 allele on risk prediction. Similar to that of the combined model, our stratified results did not show a considerable improvement of the MetaGRS. Our study showed that the prediction power of the MetaGRS significantly outperformed that of the reference model without any genetic information, but was effectively equivalent to the prediction power of the AD GRS.},
  journal={Frontiers in Aging Neuroscience},
  author={Clark, Kaylyn and Fu, Wei and Liu, Chia-Lun and Ho, Pei-Chuan and Wang, Hui and Lee, Wan-Ping and Chou, Shin-Yi and Wang, Li-San and Tzeng, Jung-Ying},
  year={2023},
  month=jul
}

@article{jeng_hu_venkat_lu_tzeng_2023,
  title={Transfer learning with false negative control improves polygenic risk prediction},
  volume={19},
  ISSN={1553-7404},
  url={http://dx.doi.org/10.1371/journal.pgen.1010597},
  DOI={10.1371/journal.pgen.1010597},
  abstractNote={Polygenic risk score (PRS) is a quantity that aggregates the effects of variants across the genome and estimates an individual’s genetic predisposition for a given trait.
PRS analysis typically contains two input data sets: base data for effect size estimation and target data for individual-level prediction. Given the availability of large-scale base data, it becomes more common that the ancestral background of base and target data do not perfectly match. In this paper, we treat the GWAS summary information obtained in the base data as knowledge learned from a pre-trained model, and adopt a transfer learning framework to effectively leverage the knowledge learned from the base data that may or may not have similar ancestral background as the target samples to build prediction models for target individuals. Our proposed transfer learning framework consists of two main steps: (1) conducting false negative control (FNC) marginal screening to extract useful knowledge from the base data; and (2) performing joint model training to integrate the knowledge extracted from base data with the target training data for accurate trans-data prediction. This new approach can significantly enhance the computational and statistical efficiency of joint-model training, alleviate over-fitting, and facilitate more accurate trans-data prediction when heterogeneity level between target and base data sets is small or high.},
  number={11},
  journal={PLOS Genetics},
  publisher={Public Library of Science (PLoS)},
  author={Jeng, Xinge Jessie and Hu, Yifei and Venkat, Vaishnavi and Lu, Tzu-Pin and Tzeng, Jung-Ying},
  editor={Epstein, Michael P.},
  year={2023},
  month=nov,
  pages={e1010597}
}

@article{wang_tzeng_huang_maguire_hoyo_allen_2023,
  title={Duration of exposure to epidural anesthesia at delivery, {DNA} methylation in umbilical cord blood and their association with offspring asthma in {Non-Hispanic Black} women},
  volume={9},
  ISSN={2058-5888},
  url={https://doi.org/10.1093/eep/dvac026},
  DOI={10.1093/eep/dvac026},
  abstractNote={Epidural anesthesia is an effective pain relief modality, widely used for labor analgesia.
Childhood asthma is one of the commonest chronic medical illnesses in the USA which places a significant burden on the health-care system. We recently demonstrated a negative association between the duration of epidural anesthesia and the development of childhood asthma; however, the underlying molecular mechanisms still remain unclear. In this study of 127 mother–child pairs comprised of 75 Non-Hispanic Black (NHB) and 52 Non-Hispanic White (NHW) from the Newborn Epigenetic Study, we tested the hypothesis that umbilical cord blood DNA methylation mediates the association between the duration of exposure to epidural anesthesia at delivery and the development of childhood asthma and whether this differed by race/ethnicity. In the mother–child pairs of NHB ancestry, the duration of exposure to epidural anesthesia was associated with a marginally lower risk of asthma (odds ratio = 0.88, 95% confidence interval = 0.76–1.01) for each 1-h increase in exposure to epidural anesthesia. Of the 20 CpGs in the NHB population showing the strongest mediation effect, 50% demonstrated an average mediation proportion of 52%, with directional consistency of direct and indirect effects. These top 20 CpGs mapped to 21 genes enriched for pathways engaged in antigen processing, antigen presentation, protein ubiquitination and regulatory networks related to the Major Histocompatibility Complex (MHC) class I complex and Nuclear Factor Kappa-B (NFkB) complex.
Our findings suggest that DNA methylation in immune-related pathways contributes to the effects of the duration of exposure to epidural anesthesia on childhood asthma risk in NHB offspring.},
  number={1},
  journal={Environmental Epigenetics},
  author={Wang, Yaxu and Tzeng, Jung-Ying and Huang, Yueyang and Maguire, Rachel and Hoyo, Cathrine and Allen, Terrence K.},
  year={2023},
  month=jan
}

@article{huang_callahan_wu_holloway_brochu_lu_peng_tzeng_2022,
  title={Phylogeny-guided microbiome {OTU}-specific association test ({POST})},
  volume={10},
  ISSN={2049-2618},
  DOI={10.1186/s40168-022-01266-3},
  abstractNote={Background: The relationship between host conditions and microbiome profiles, typically characterized by operational taxonomic units (OTUs), contains important information about the microbial role in human health. Traditional association testing frameworks are challenged by the high dimensionality and sparsity of typical microbiome profiles. Phylogenetic information is often incorporated to address these challenges with the assumption that evolutionarily similar taxa tend to behave similarly. However, this assumption may not always be valid due to the complex effects of microbes, and phylogenetic information should be incorporated in a data-supervised fashion. Results: In this work, we propose a local collapsing test called phylogeny-guided microbiome OTU-specific association test (POST). In POST, whether or not to borrow information and how much information to borrow from the neighboring OTUs in the phylogenetic tree are supervised by phylogenetic distance and the outcome-OTU association. POST is constructed under the kernel machine framework to accommodate complex OTU effects and extends kernel machine microbiome tests from community level to OTU level. Using simulation studies, we show that when the phylogenetic tree is informative, POST has better performance than existing OTU-level association tests.
When the phylogenetic tree is not informative, POST achieves similar performance as existing methods. Finally, in real data applications on bacterial vaginosis and on preterm birth, we find that POST can identify similar or more outcome-associated OTUs that are of biological relevance compared to existing methods. Conclusions: Using POST, we show that adaptively leveraging the phylogenetic information can enhance the selection performance of associated microbiome features by improving the overall true-positive and false-positive detection. We developed a user friendly R package POSTm which is freely available on CRAN (https://CRAN.R-project.org/package=POSTm).},
  number={1},
  journal={Microbiome},
  author={Huang, Caizhi and Callahan, Benjamin John and Wu, Michael C. and Holloway, Shannon T. and Brochu, Hayden and Lu, Wenbin and Peng, Xinxia and Tzeng, Jung-Ying},
  year={2022},
  month=jun
}

@article{yu_lu_hsiao_lin_wu_tzeng_hsiao_2021,
  title={An Integrative Co-localization ({INCO}) Analysis for {SNV} and {CNV} Genomic Features With an Application to {Taiwan Biobank} Data},
  volume={12},
  ISSN={1664-8021},
  DOI={10.3389/fgene.2021.709555},
  abstractNote={Genomic studies have been a major approach to elucidating disease etiology and to exploring potential targets for treatments of many complex diseases. Statistical analyses in these studies often face the challenges of multiplicity, weak signals, and the nature of dependence among genetic markers. This situation becomes even more complicated when multi-omics data are available. To integrate the data from different platforms, various integrative analyses have been adopted, ranging from the direct union or intersection operation on sets derived from different single-platform analysis to complex hierarchical multi-level models. The former ignores the biological relationship between molecules while the latter can be hard to interpret.
We propose in this study an integrative approach that combines both single nucleotide variants (SNVs) and copy number variations (CNVs) in the same genomic unit to co-localize the concurrent effect and to deal with the sparsity due to rare variants. This approach is illustrated with simulation studies to evaluate its performance and is applied to low-density lipoprotein cholesterol and triglyceride measurements from Taiwan Biobank. The results show that the proposed method can more effectively detect the collective effect from both SNVs and CNVs compared to traditional methods. For the biobank analysis, the identified genetic regions including the gene VNN2 could be novel and deserve further investigation.},
  journal={Frontiers in Genetics},
  author={Yu, Qi-You and Lu, Tzu-Pin and Hsiao, Tzu-Hung and Lin, Ching-Heng and Wu, Chi-Yun and Tzeng, Jung-Ying and Hsiao, Chuhsing Kate},
  year={2021},
  month=sep
}

@article{chang_yang_lu_huang_huang_hung_miecznikowski_lu_tzeng_2021,
  title={Gene-set integrative analysis of multi-omics data using tensor-based association test},
  volume={37},
  ISSN={1460-2059},
  DOI={10.1093/bioinformatics/btab125},
  abstractNote={Motivation: Facilitated by technological advances and the decrease in costs, it is feasible to gather subject data from several omics platforms. Each platform assesses different molecular events, and the challenge lies in efficiently analyzing these data to discover novel disease genes or mechanisms. A common strategy is to regress the outcomes on all omics variables in a gene set. However, this approach suffers from problems associated with high-dimensional inference. Results: We introduce a tensor-based framework for variable-wise inference in multi-omics analysis. By accounting for the matrix structure of an individual’s multi-omics data, the proposed tensor methods incorporate the relationship among omics effects, reduce the number of parameters, and boost the modeling efficiency.
We derive the variable-specific tensor test and enhance computational efficiency of tensor modeling. Using simulations and data applications on the Cancer Cell Line Encyclopedia (CCLE), we demonstrate our method performs favorably over baseline methods and will be useful for gaining biological insights in multi-omics analysis. Availability and implementation: R function and instruction are available from the authors’ website: https://www4.stat.ncsu.edu/~jytzeng/Software/TR.omics/TRinstruction.pdf. Supplementary information: Supplementary data are available at Bioinformatics online.},
  number={16},
  journal={Bioinformatics},
  author={Chang, Sheng-Mao and Yang, Meng and Lu, Wenbin and Huang, Yu-Jyun and Huang, Yueyang and Hung, Hung and Miecznikowski, Jeffrey C. and Lu, Tzu-Pin and Tzeng, Jung-Ying},
  year={2021},
  month=aug,
  pages={2259--2265}
}

@article{chi_ipsen_hsiao_lin_wang_lee_lu_tzeng_2021,
  title={{SEAGLE}: A Scalable Exact Algorithm for Large-Scale Set-Based Gene-Environment Interaction Tests in Biobank Data},
  volume={12},
  ISSN={1664-8021},
  DOI={10.3389/fgene.2021.710055},
  abstractNote={The explosion of biobank data offers unprecedented opportunities for gene-environment interaction (G×E) studies of complex diseases because of the large sample sizes and the rich collection in genetic and non-genetic information. However, the extremely large sample size also introduces new computational challenges in G×E assessment, especially for set-based G×E variance component (VC) tests, which are a widely used strategy to boost overall G×E signals and to evaluate the joint G×E effect of multiple variants from a biologically meaningful unit (e.g., gene). In this work, we focus on continuous traits and present SEAGLE, a Scalable Exact AlGorithm for Large-scale set-based G×E tests, to permit G×E VC tests for biobank-scale data.
SEAGLE employs modern matrix computations to calculate the test statistic and p-value of the G×E VC test in a computationally efficient fashion, without imposing additional assumptions or relying on approximations. SEAGLE can easily accommodate sample sizes in the order of $10^5$, is implementable on standard laptops, and does not require specialized computing equipment. We demonstrate the performance of SEAGLE using extensive simulations. We illustrate its utility by conducting genome-wide gene-based G×E analysis on the Taiwan Biobank data to explore the interaction of gene and physical activity status on body mass index.},
  journal={Frontiers in Genetics},
  author={Chi, Jocelyn T. and Ipsen, Ilse C. F. and Hsiao, Tzu-Hung and Lin, Ching-Heng and Wang, Li-San and Lee, Wan-Ping and Lu, Tzu-Pin and Tzeng, Jung-Ying},
  year={2021},
  month=nov
}

@comment{Key below normalised: the exported key ended in "et al._2020"; whitespace is not legal inside a citation key, so the entry could not be parsed or cited.}
@article{brucker_lu_west_yu_hsiao_hsiao_lin_magnusson_sullivan_szatkiewicz_et_al_2020,
  title={Association test using {Copy Number Profile Curves} ({CONCUR}) enhances power in rare copy number variant analysis},
  volume={16},
  ISSN={1553-7358},
  url={https://doi.org/10.1371/journal.pcbi.1007797},
  DOI={10.1371/journal.pcbi.1007797},
  abstractNote={Copy number variants (CNVs) are the gain or loss of DNA segments in the genome that can vary in dosage and length. CNVs comprise a large proportion of variation in human genomes and impact health conditions. To detect rare CNV associations, kernel-based methods have been shown to be a powerful tool due to their flexibility in modeling the aggregate CNV effects, their ability to capture effects from different CNV features, and their accommodation of effect heterogeneity. To perform a kernel association test, a CNV locus needs to be defined so that locus-specific effects can be retained during aggregation. However, CNV loci are arbitrarily defined and different locus definitions can lead to different performance depending on the underlying effect patterns.
In this work, we develop a new kernel-based test called CONCUR (i.e., copy number profile curve-based association test) that is free from a definition of locus and evaluates CNV-phenotype associations by comparing individuals’ copy number profiles across the genomic regions. CONCUR is built on the proposed concepts of “copy number profile curves” to describe the CNV profile of an individual, and the “common area under the curve (cAUC) kernel” to model the multi-feature CNV effects. The proposed method captures the effects of CNV dosage and length, accounts for the numerical nature of copy numbers, and accommodates between- and within-locus etiological heterogeneity without the need to define artificial CNV loci as required in current kernel methods. In a variety of simulation settings, CONCUR shows comparable or improved power over existing approaches. Real data analyses suggest that CONCUR is well powered to detect CNV effects in the Swedish Schizophrenia Study and the Taiwan Biobank.},
  number={5},
  journal={PLOS Computational Biology},
  publisher={Public Library of Science (PLoS)},
  author={Brucker, Amanda and Lu, Wenbin and West, Rachel Marceau and Yu, Qi-You and Hsiao, Chuhsing Kate and Hsiao, Tzu-Hung and Lin, Ching-Heng and Magnusson, Patrik K. E. and Sullivan, Patrick F. and Szatkiewicz, Jin P. and others},
  editor={Ma, Jian},
  year={2020},
  month=may
}

@article{jeng_rhyne_zhang_tzeng_2020,
  title={Effective {SNP} ranking improves the performance of {eQTL} mapping},
  volume={44},
  ISSN={1098-2272},
  url={https://doi.org/10.1002/gepi.22293},
  DOI={10.1002/gepi.22293},
  abstractNote={Genome‐wide expression quantitative trait loci (eQTLs) mapping explores the relationship between gene expression and DNA variants, such as single‐nucleotide polymorphism (SNPs), to understand genetic basis of human diseases.
Due to the large number of genes and SNPs that need to be assessed, current methods for eQTL mapping often suffer from low detection power, especially for identifying trans‐eQTLs. In this paper, we propose the idea of performing SNP ranking based on the higher criticism statistic, a summary statistic developed in large‐scale signal detection. We illustrate how the HC‐based SNP ranking can effectively prioritize eQTL signals over noise, greatly reduce the burden of joint modeling, and improve the power for eQTL mapping. Numerical results in simulation studies demonstrate the superior performance of our method compared to existing methods. The proposed method is also evaluated in HapMap eQTL data analysis and the results are compared to a database of known eQTLs.},
  number={6},
  journal={Genetic Epidemiology},
  author={Jeng, Xinge Jessie and Rhyne, Jacob and Zhang, Teng and Tzeng, Jung-Ying},
  year={2020},
  month=sep,
  pages={611--619}
}

@article{rhyne_jeng_chi_tzeng_2020,
  title={{FastLORS}: Joint modelling for expression quantitative trait loci mapping in {R}},
  volume={9},
  ISSN={2049-1573},
  url={https://doi.org/10.1002/sta4.265},
  DOI={10.1002/sta4.265},
  abstractNote={FastLORS is a software package that implements a new algorithm to solve sparse multivariate regression for expression quantitative trait loci (eQTLs) mapping. FastLORS solves the same optimization problem as LORS, an existing popular algorithm. The optimization problem is solved through inexact block coordinate descent with updates by proximal gradient steps, which reduces the computational cost compared with LORS. We apply LORS and FastLORS to a real dataset for eQTL mapping and demonstrate that FastLORS delivers comparable results with LORS in much less computing time.},
  number={1},
  journal={Stat},
  publisher={Wiley},
  author={Rhyne, Jacob and Jeng, Xinge Jessie and Chi, Eric C.
and Tzeng, Jung-Ying},
  year={2020}
}

@article{martinez_maity_yolken_sullivan_tzeng_2020,
  title={Robust kernel association testing ({RobKAT})},
  volume={44},
  ISSN={1098-2272},
  url={https://doi.org/10.1002/gepi.22280},
  DOI={10.1002/gepi.22280},
  abstractNote={Testing the association between single‐nucleotide polymorphism (SNP) effects and a response is often carried out through kernel machine methods based on least squares, such as the sequence kernel association test (SKAT). However, these least‐squares procedures are designed for a normally distributed conditional response, which may not apply. Other robust procedures such as the quantile regression kernel machine (QRKM) restrict the choice of the loss function and only allow inference on conditional quantiles. We propose a general and robust kernel association test with a flexible choice of the loss function, no distributional assumptions, and has SKAT and QRKM as special cases. We evaluate our proposed robust association test (RobKAT) across various data distributions through a simulation study. When errors are normally distributed, RobKAT controls type I error and shows comparable power with SKAT. In all other distributional settings investigated, our robust test has similar or greater power than SKAT. Finally, we apply our robust testing method to data from the Clinical Antipsychotic Trials of Intervention Effectiveness (CATIE) clinical trial to detect associations between selected genes including the major histocompatibility complex (MHC) region on chromosome six and neurotropic herpesvirus antibody levels in schizophrenia patients. RobKAT detected significant association with four SNP sets (HST1H2BJ, MHC, POM12L2, and SLC17A1), three of which were undetected by SKAT.},
  number={3},
  journal={Genetic Epidemiology},
  author={Martinez, Kara and Maity, Arnab and Yolken, Robert H. and Sullivan, Patrick F.
and Tzeng, Jung-Ying},
  year={2020},
  month=apr,
  pages={272--282}
}

@article{huang_tzeng_maguire_hoyo_allen_2020,
  title={The association between neuraxial anesthesia and the development of childhood asthma - a secondary analysis of the newborn epigenetics study cohort},
  volume={36},
  ISSN={1473-4877},
  DOI={10.1080/03007995.2020.1747417},
  abstractNote={Objectives: Childhood asthma is a common chronic illness that has been associated with mode of delivery. However, the effect of cesarean delivery alone does not fully account for the increased prevalence of childhood asthma. We tested the hypothesis that neuraxial anesthesia used for labor analgesia and cesarean delivery alters the risk of developing childhood asthma. Methods: Within the Newborn Epigenetics Study birth cohort, 196 mother and child pairs with entries in the electronic anesthesia records were included. From these records, data on maternal anesthesia type, duration of exposure, and drugs administered peripartum were abstracted and combined with questionnaire-derived prenatal risk factors and medical records and questionnaire-derived asthma diagnosis data in children. Logistic regression models were used to evaluate associations between type of anesthesia, duration of anesthesia, and the development of asthma in males and females. Results: We found that longer duration of epidural anesthesia was associated with a lower risk of asthma in male children (OR = 0.80; 95% CI = 0.66–0.95) for each hour of epidural exposure. Additionally, a unit increase in the composite dose of local anesthetics and opioid analgesics administered via the spinal route was associated with a lower risk of asthma in both male (OR = 0.59, 95% CI = 0.36–0.96) and female children (OR 0.26, 95% CI 0.09–0.82). Conclusion: Our data suggest that peripartum exposure to neuraxial anesthesia may reduce the risk of childhood asthma primarily in males.
Larger human studies and model systems with longer follow-up are required to elucidate these findings.},
  number={6},
  journal={Current Medical Research and Opinion},
  author={Huang, Yueyang and Tzeng, Jung-Ying and Maguire, Rachel and Hoyo, Cathrine and Allen, Terrence},
  year={2020},
  month=jun,
  pages={1025--1032}
}

@comment{Key below normalised: the exported key ended in "et al._2019"; whitespace is not legal inside a citation key, so the entry could not be parsed or cited.}
@article{west_lu_rotroff_kuenemann_chang_wu_wagner_buse_motsinger-reif_fourches_et_al_2019,
  title={Identifying individual risk rare variants using protein structure guided local tests ({POINT})},
  volume={15},
  ISSN={1553-7358},
  DOI={10.1371/journal.pcbi.1006722},
  abstractNote={Rare variants are of increasing interest to genetic association studies because of their etiological contributions to human complex diseases. Due to the rarity of the mutant events, rare variants are routinely analyzed on an aggregate level. While aggregation analyses improve the detection of global-level signal, they are not able to pinpoint causal variants within a variant set. To perform inference on a localized level, additional information, e.g., biological annotation, is often needed to boost the information content of a rare variant. Following the observation that important variants are likely to cluster together on functional domains, we propose a protein structure guided local test (POINT) to provide variant-specific association information using structure-guided aggregation of signal. Constructed under a kernel machine framework, POINT performs local association testing by borrowing information from neighboring variants in the 3-dimensional protein space in a data-adaptive fashion. Besides merely providing a list of promising variants, POINT assigns each variant a p-value to permit variant ranking and prioritization.
We assess the selection performance of POINT using simulations and illustrate how it can be used to prioritize individual rare variants in PCSK9, ANGPTL4 and CETP in the Action to Control Cardiovascular Risk in Diabetes (ACCORD) clinical trial data.},
  number={2},
  journal={PLOS Computational Biology},
  author={West, Rachel Marceau and Lu, Wenbin and Rotroff, Daniel M. and Kuenemann, Melaine A. and Chang, Sheng-Mao and Wu, Michael C. and Wagner, Michael J. and Buse, John B. and Motsinger-Reif, Alison A. and Fourches, Denis and others},
  year={2019},
  month=feb
}

@comment{Key below normalised: the exported key ended in "et al._2019"; whitespace is not legal inside a citation key, so the entry could not be parsed or cited.}
@article{chang_tsai_tzeng_yeh_chen_lai_liao_hua_tsai_huang_et_al_2019,
  title={Reference equations for spirometry in healthy {Asian} children aged 5 to 18 years in {Taiwan}},
  volume={12},
  ISSN={1939-4551},
  DOI={10.1016/j.waojou.2019.100074},
  abstractNote={This study aimed to establish reference equations for spirometry in healthy Taiwanese children and assess the applicability of the Global Lung Function Initiative (GLI)-2012 equations to Taiwanese children. Spirometric data collected from 757 healthy Taiwanese children aged 5 to 18 years in a population-based cohort study. Prediction equations derived using linear regression and the generalized additive models for location, scale and shape (GAMLSS) method, respectively. The GLI-2012 South East Asian equations did not provide a close fit with mean ± standard error z-scores of -0.679 ± 0.030 (FVC), -0.186 ± 0.044 (FEV1), -0.875 ± 0.049 (FEV1/FVC ratio) and -2.189 ± 0.063 (FEF25-75) for girls; and 0.238 ± 0.059, -0.061 ± 0.053, -0.513 ± 0.059 and -1.896 ± 0.077 for boys. The proposed GAMLSS models took age, height, and weight into account. GAMLSS models for boys and girls captured the characteristics of spirometric data in the study population closely in contrast to the linear regression models and the GLI-2012 equations. This study provides up-to-date reference values for spirometry using GAMLSS modeling in healthy Taiwanese children aged 5 to 18 years.
Our study provides evidence that the GLI-2012 reference equations are not properly matched to spirometric data in a contemporary Taiwanese child population, indicating the urgent need for an update of GLI reference values by inclusion of more data of non-Caucasian descent.},
  number={11},
  journal={World Allergy Organization Journal},
  author={Chang, Sheng-Mao and Tsai, Hui-Ju and Tzeng, Jung-Ying and Yeh, Kuo-Wei and Chen, Li-Chen and Lai, Shen-Hao and Liao, Sui-Ling and Hua, Man-Chin and Tsai, Ming-Han and Huang, Jing-Long and others},
  year={2019},
  month=nov
}

@comment{Key below normalised: the exported key ended in "et al._2019"; whitespace is not legal inside a citation key, so the entry could not be parsed or cited.}
@article{szatkiewicz_marceau_yilmaz_bulik_crowley_mattheisen_sullivan_lu_maity_tzeng_et_al_2019,
  title={Variance Component Test for Cross-Disorder Pathway Analysis},
  volume={29},
  ISSN={1873-7862},
  DOI={10.1016/j.euroneuro.2018.08.252},
  journal={European Neuropsychopharmacology},
  author={Szatkiewicz, Jin and Marceau, Rachel and Yilmaz, Zeynep and Bulik, Cynthia and Crowley, James and Mattheisen, Manuel and Sullivan, Patrick and Lu, Wenbin and Maity, Arnab and Tzeng, Jung-Ying and others},
  year={2019},
  pages={1204--1205}
}

@article{green_hoyo_mattingly_luo_tzeng_murphy_buchwalter_planchart_2018,
  title={Cadmium exposure increases the risk of juvenile obesity: a human and zebrafish comparative study},
  volume={42},
  ISSN={0307-0565 1476-5497},
  url={http://dx.doi.org/10.1038/S41366-018-0036-Y},
  DOI={10.1038/S41366-018-0036-Y},
  abstractNote={Objective: Human obesity is a complex metabolic disorder disproportionately affecting people of lower socioeconomic strata, and ethnic minorities, especially African Americans and Hispanics. Although genetic predisposition and a positive energy balance are implicated in obesity, these factors alone do not account for the excess prevalence of obesity in lower socioeconomic populations.
Therefore, environmental factors, including exposure to pesticides, heavy metals, and other contaminants, are agents widely suspected to have obesogenic activity, and they also are spatially correlated with lower socioeconomic status. Our study investigates the causal relationship between exposure to the heavy metal, cadmium (Cd), and obesity in a cohort of children and in a zebrafish model of adipogenesis. Design: An extensive collection of first trimester maternal blood samples obtained as part of the Newborn Epigenetics Study (NEST) was analyzed for the presence of Cd, and these results were cross analyzed with the weight-gain trajectory of the children through age 5 years. Next, the role of Cd as a potential obesogen was analyzed in an in vivo zebrafish model. Results: Our analysis indicates that the presence of Cd in maternal blood during pregnancy is associated with increased risk of juvenile obesity in the offspring, independent of other variables, including lead (Pb) and smoking status. Our results are recapitulated in a zebrafish model, in which exposure to Cd at levels approximating those observed in the NEST study is associated with increased adiposity. Conclusion: Our findings identify Cd as a potential human obesogen. Moreover, these observations are recapitulated in a zebrafish model, suggesting that the underlying mechanisms may be evolutionarily conserved, and that zebrafish may be a valuable model for uncovering pathways leading to Cd-mediated obesity in human populations.},
  number={7},
  journal={International Journal of Obesity},
  publisher={Springer Science and Business Media LLC},
  author={Green, Adrian J. and Hoyo, Cathrine and Mattingly, Carolyn J. and Luo, Yiwen and Tzeng, Jung-Ying and Murphy, Susan K. and Buchwalter, David B.
and Planchart, Antonio},
  year={2018},
  month=feb,
  pages={1285--1295}
}

@article{jeng_zhang_tzeng_2018,
  title={Efficient Signal Inclusion With Genomic Applications},
  volume={114},
  ISSN={0162-1459 1537-274X},
  url={http://dx.doi.org/10.1080/01621459.2018.1518236},
  DOI={10.1080/01621459.2018.1518236},
  abstractNote={This article addresses the challenge of efficiently capturing a high proportion of true signals for subsequent data analyses when sample sizes are relatively limited with respect to data dimension. We propose the signal missing rate (SMR) as a new measure for false-negative control to account for the variability of false-negative proportion. Novel data-adaptive procedures are developed to control SMR without incurring many unnecessary false positives under dependence. We justify the efficiency and adaptivity of the proposed methods via theory and simulation. The proposed methods are applied to GWAS on human height to effectively remove irrelevant single nucleotide polymorphisms (SNPs) while retaining a high proportion of relevant SNPs for subsequent polygenic analysis. Supplementary materials for this article are available online.},
  number={528},
  journal={Journal of the American Statistical Association},
  publisher={Informa UK Limited},
  author={Jeng, Xinge Jessie and Zhang, Teng and Tzeng, Jung-Ying},
  year={2018},
  month=sep,
  pages={1--23}
}

@comment{Entry below: (1) key normalised, the exported key ended in "et al._2019" and whitespace is not legal inside a citation key; (2) type changed from misc to article because it carries journal, volume, number and pages; (3) the exported abstractNote was dropped because it held a truncated fragment of an unrelated reference list (skin-microbiome citations numbered 5 to 9), not this paper's abstract.}
@article{yao_chung_lin_tsai_chang_yeh_tsai_liao_hua_lai_et_al_2019,
  title={Genetic loci determining total immunoglobulin {E} levels from birth through adulthood},
  volume={74},
  ISSN={1398-9995},
  DOI={10.1111/all.13654},
  number={3},
  journal={Allergy},
  author={Yao, Tsung-Chieh and Chung, Ren-Hua and Lin, Chung-Yen and Tsai, Pei-Chien and Chang, Wei-Chiao and Yeh, Kuo-Wei and Tsai, Ming-Han and Liao, Sui-Ling and Hua, Man-Chin and Lai, Shen-Hao and others},
  year={2019},
  month=mar,
  pages={621--625}
}

@article{maity_zhao_sullivan_tzeng_2018,
  title={Inference on phenotype-specific effects of genes using multivariate kernel machine regression},
  volume={42},
  ISSN={1098-2272},
  url={https://doi.org/10.1002/gepi.22096},
  DOI={10.1002/gepi.22096},
  abstractNote={We consider the problem of assessing the joint effect of a set of genetic markers on multiple, possibly correlated phenotypes of interest. We develop a kernel machine based multivariate regression framework, where the joint effect of the marker set on each of the phenotypes is modeled using prespecified kernel functions with unknown variance components. Unlike most existing methods that mainly focus on the global association between the marker set and the phenotype set, we develop estimation and testing procedures to study phenotype‐specific associations. Specifically, we develop an estimation method based on the penalized likelihood approach to estimate phenotype‐specific effects and their corresponding standard errors while accounting for possible correlation among the phenotypes. We develop testing procedures for the association of the marker set with any subset of phenotypes using a score‐based variance components testing method.
We assess the performance of our proposed methodology via a simulation study and demonstrate the utility of the proposed method using the Clinical Antipsychotic Trials of Intervention Effectiveness (CATIE) data.},
  number = {1},
  journal = {Genetic Epidemiology},
  publisher = {Wiley-Blackwell},
  author = {Maity, Arnab and Zhao, Jing and Sullivan, Patrick F. and Tzeng, Jung-Ying},
  year = {2018},
  month = feb,
  pages = {64--79},
}

@article{wang_tzeng_wu_preisig_hsiao_2018,
  title = {Reexamining Dis/Similarity-Based Tests for Rare-Variant Association with Case-Control Samples},
  volume = {209},
  issn = {1943-2631},
  doi = {10.1534/genetics.118.300769},
  abstractNote = {A properly designed distance-based measure can capture informative genetic differences among individuals with different phenotypes and can be used to detect variants responsible for the phenotypes. To detect associated variants, various tests have been designed to contrast genetic dissimilarity or similarity scores of certain subject groups in different ways, among which the most widely used strategy is to quantify the difference between the within-group genetic dissimilarity/similarity (i.e., case-case and control-control similarities) and the between-group dissimilarity/similarity (i.e., case-control similarities). While it has been noted that for common variants, the within-group and the between-group measures should all be included; in this work, we show that for rare variants, comparison based on the two within-group measures can more effectively quantify the genetic difference between cases and controls. The between-group measure tends to overlap with one of the two within-group measures for rare variants, although such overlap is not present for common variants. Consequently, a dissimilarity or similarity test that includes the between-group information tends to attenuate the association signals and leads to power loss. 
Based on these findings, we propose a dissimilarity test that compares the degree of SNP dissimilarity within cases to that within controls to better characterize the difference between two disease phenotypes. We provide the statistical properties, asymptotic distribution, and computation details for a small sample size of the proposed test. We use simulated and real sequence data to assess the performance of the proposed test, comparing it with other rare-variant methods including those similarity-based tests that use both within-group and between-group information. As similarity-based approaches serve as one of the dominating approaches in rare-variant analysis, our results provide some insight for the effective detection of rare variants.},
  number = {1},
  journal = {Genetics},
  author = {Wang, Charlotte and Tzeng, Jung-Ying and Wu, Pei-Zhen and Preisig, Martin and Hsiao, Chuhsing Kate},
  year = {2018},
  month = may,
  pages = {105--113},
}

@article{davenport_maity_sullivan_tzeng_2017,
  title = {A Powerful Test for {SNP} Effects on Multivariate Binary Outcomes Using Kernel Machine Regression},
  volume = {10},
  issn = {1867-1764 1867-1772},
  url = {http://dx.doi.org/10.1007/S12561-017-9189-9},
  doi = {10.1007/S12561-017-9189-9},
  abstractNote = {Evaluating multiple binary outcomes is common in genetic studies of complex diseases. These outcomes are often correlated because they are collected from the same individual and they may share common marker effects. In this paper, we propose a procedure to test for effect of a single nucleotide polymorphism-set on multiple, possibly correlated, binary responses. We develop a score-based test using a non-parametric modeling framework that jointly models the global effect of the marker set. We account for the non-linear effects and potentially complicated interaction between markers using reproducing kernels. 
Our testing procedure only requires estimation under the null hypothesis and we use multivariate generalized estimating equations to estimate the model components to account for the correlation among the outcomes. We evaluate finite sample performance of our test via simulation study and demonstrate our methods using the Clinical Antipsychotic Trials of Intervention Effectiveness antibody study data and the CoLaus study data.},
  number = {1},
  journal = {Statistics in Biosciences},
  publisher = {Springer Science and Business Media LLC},
  author = {Davenport, Clemontina A. and Maity, Arnab and Sullivan, Patrick F. and Tzeng, Jung-Ying},
  year = {2017},
  month = mar,
  pages = {117--138},
}

@article{szatkiewicz_tzeng_magnusson_sullivan_2017,
  author = {Szatkiewicz, J. and Tzeng, J. Y. and Magnusson, P. and Sullivan, P.},
  title = {A new method for detecting associations with rare copy-number variants},
  journal = {European Neuropsychopharmacology},
  volume = {27},
  pages = {S165--166},
  year = {2017},
}

@article{chang_tzeng_chen_2017,
  title = {Fast {Bayesian} variable screenings for binary response regressions with small sample size},
  volume = {87},
  issn = {1563-5163},
  doi = {10.1080/00949655.2017.1341887},
  abstractNote = {Screening procedures play an important role in data analysis, especially in high-throughput biological studies where the datasets consist of more covariates than independent subjects. In this article, a Bayesian screening procedure is introduced for the binary response models with logit and probit links. In contrast to many screening rules based on marginal information involving one or a few covariates, the proposed Bayesian procedure simultaneously models all covariates and uses closed-form screening statistics. 
Specifically, we use the posterior means of the regression coefficients as screening statistics; by imposing a generalized g-prior on the regression coefficients, we derive the analytical form of their posterior means and compute the screening statistics without Markov chain Monte Carlo implementation. We evaluate the utility of the proposed Bayesian screening method using simulations and real data analysis. When the sample size is small, the simulation results suggest improved performance with comparable computational cost.},
  number = {14},
  journal = {Journal of Statistical Computation and Simulation},
  author = {Chang, S.-M. and Tzeng, J.-Y. and Chen, R.-B.},
  year = {2017},
  pages = {2708--2723},
}

@article{luo_mccullough_tzeng_darrah_vengosh_maguire_maity_samuel-hodge_murphy_mendez_et_al_2017,
  author = {Luo, Y. W. and McCullough, L. E. and Tzeng, J. Y. and Darrah, T. and Vengosh, A. and Maguire, R. L. and Maity, A. and Samuel-Hodge, C. and Murphy, S. K. and Mendez, M. A. and others},
  title = {Maternal blood cadmium, lead and arsenic levels, nutrient combinations, and offspring birthweight},
  journal = {BMC Public Health},
  volume = {17},
  year = {2017},
}

@article{luo_maity_wu_smith_duan_li_tzeng_2018,
  title = {On the substructure controls in rare variant analysis: Principal components or variance components?},
  volume = {42},
  issn = {1098-2272},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-85039155216&partnerID=MN8TOARS},
  doi = {10.1002/gepi.22102},
  abstractNote = {Recent studies showed that population substructure (PS) can have more complex impact on rare variant tests and that similarity‐based collapsing tests (e.g., SKAT) may suffer more severely by PS than burden‐based tests. In this work, we evaluate the performance of SKAT coupling with principal components (PC) or variance components (VC) based PS correction methods. 
We consider confounding effects caused by PS including stratified populations, admixed populations, and spatially distributed nongenetic risk; we investigate which types of variants (e.g., common, less frequent, rare, or all variants) should be used to effectively control for confounding effects. We found that (i) PC‐based methods can account for confounding effects in most scenarios except for admixture, although the number of sufficient PCs depends on the PS complexity and the type of variants used. (ii) PCs based on all variants (i.e., common + less frequent + rare) tend to require equal or fewer sufficient PCs and often achieve higher power than PCs based on other variant types. (iii) VC‐based methods can effectively adjust for confounding in all scenarios (even for admixture), though the type of variants should be used to construct VC may vary. (iv) VC based on all variants works consistently in all scenarios, though its power may be sometimes lower than VC based on other variant types. Given that the best‐performed method and which variants to use depend on the underlying unknown confounding mechanisms, a robust strategy is to perform SKAT analyses using VC‐based methods based on all variants.},
  number = {3},
  journal = {Genetic Epidemiology},
  author = {Luo, Yiwen and Maity, Arnab and Wu, Michael C. and Smith, Chris and Duan, Qing and Li, Yun and Tzeng, Jung-Ying},
  year = {2018},
  month = apr,
  pages = {276--287},
}

@article{kong_maity_hsu_tzeng_2018,
  title = {Rejoinder to ``A note on testing and estimation in marker-set association study using semiparametric quantile regression kernel machine''},
  volume = {74},
  issn = {1541-0420},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-85032786553&partnerID=MN8TOARS},
  doi = {10.1111/biom.12786},
  abstractNote = {Dehan Kong , Arnab Maity, Fang-Chi Hsu, and Jung-Ying Tzeng Department of Statistical Sciences, University of Toronto, Ontario, Canada Department of Statistics, North Carolina State University, North Carolina, U.S.A. 
Department of Biostatistical Sciences, Wake Forest University, North Carolina, U.S.A. Department of Statistics and Bioinformatics Research Center North Carolina State University, North Carolina, U.S.A. Department of Statistics, National Cheng-Kung University, Taiwan ∗email: kongdehan@utstat.toronto.edu},
  number = {2},
  journal = {Biometrics},
  author = {Kong, Dehan and Maity, Arnab and Hsu, Fang-Chi and Tzeng, Jung-Ying},
  year = {2018},
  month = jun,
  pages = {767--768},
}

@article{zhang_huang_xu_tzeng_conneely_guan_kang_li_2016,
  title = {Across-Platform Imputation of {DNA} Methylation Levels Incorporating Nonlocal Information Using Penalized Functional Regression},
  volume = {40},
  issn = {1098-2272},
  doi = {10.1002/gepi.21969},
  abstractNote = {DNA methylation is a key epigenetic mark involved in both normal development and disease progression. Recent advances in high‐throughput technologies have enabled genome‐wide profiling of DNA methylation. However, DNA methylation profiling often employs different designs and platforms with varying resolution, which hinders joint analysis of methylation data from multiple platforms. In this study, we propose a penalized functional regression model to impute missing methylation data. By incorporating functional predictors, our model utilizes information from nonlocal probes to improve imputation quality. Here, we compared the performance of our functional model to linear regression and the best single probe surrogate in real data and via simulations. Specifically, we applied different imputation approaches to an acute myeloid leukemia dataset consisting of 194 samples and our method showed higher imputation accuracy, manifested, for example, by a 94% relative increase in information content and up to 86% more CpG sites passing post‐imputation filtering. Our simulated association study further demonstrated that our method substantially improves the statistical power to identify trait‐associated methylation loci. 
These findings indicate that the penalized functional regression model is a convenient and valuable imputation tool for methylation data, and it can boost statistical power in downstream epigenome‐wide association study (EWAS).},
  number = {4},
  journal = {Genetic Epidemiology},
  author = {Zhang, Guosheng and Huang, Kuan-Chieh and Xu, Zheng and Tzeng, Jung-Ying and Conneely, Karen N. and Guan, Weihua and Kang, Jian and Li, Yun},
  year = {2016},
  month = may,
  pages = {333--340},
}

@article{jeng_daye_lu_tzeng_2016,
  author = {Jeng, X. J. and Daye, Z. J. and Lu, W. B. and Tzeng, J. Y.},
  title = {Rare variants association analysis in large-scale sequencing studies at the single locus level},
  journal = {PLoS Computational Biology},
  volume = {12},
  number = {6},
  year = {2016},
}

@article{marceau_lu_holloway_sale_worrall_williams_hsu_tzeng_2015,
  title = {A Fast Multiple-Kernel Method With Applications to Detect Gene-Environment Interaction},
  volume = {39},
  issn = {1098-2272},
  doi = {10.1002/gepi.21909},
  abstractNote = {Kernel machine (KM) models are a powerful tool for exploring associations between sets of genetic variants and complex traits. Although most KM methods use a single kernel function to assess the marginal effect of a variable set, KM analyses involving multiple kernels have become increasingly popular. Multikernel analysis allows researchers to study more complex problems, such as assessing gene‐gene or gene‐environment interactions, incorporating variance‐component based methods for population substructure into rare‐variant association testing, and assessing the conditional effects of a variable set adjusting for other variable sets. The KM framework is robust, powerful, and provides efficient dimension reduction for multifactor analyses, but requires the estimation of high dimensional nuisance parameters. 
Traditional estimation techniques, including regularization and the “expectation‐maximization (EM)” algorithm, have a large computational cost and are not scalable to large sample sizes needed for rare variant analysis. Therefore, under the context of gene‐environment interaction, we propose a computationally efficient and statistically rigorous “fastKM” algorithm for multikernel analysis that is based on a low‐rank approximation to the nuisance effect kernel matrices. Our algorithm is applicable to various trait types (e.g., continuous, binary, and survival traits) and can be implemented using any existing single‐kernel analysis software. Through extensive simulation studies, we show that our algorithm has similar performance to an EM‐based KM approach for quantitative traits while running much faster. We also apply our method to the Vitamin Intervention for Stroke Prevention (VISP) clinical trial, examining gene‐by‐vitamin effects on recurrent stroke risk and gene‐by‐age effects on change in homocysteine level.},
  number = {6},
  journal = {Genetic Epidemiology},
  author = {Marceau, Rachel and Lu, Wenbin and Holloway, Shannon and Sale, Michele M. and Worrall, Bradford B. and Williams, Stephen R. and Hsu, Fang-Chi and Tzeng, Jung-Ying},
  year = {2015},
  month = sep,
  pages = {456--468},
}

@article{tzeng_magnusson_sullivan_szatkiewicz_2015,
  author = {Tzeng, J. Y. and Magnusson, P. K. E. and Sullivan, P. F. and Szatkiewicz, J. P.},
  title = {A new method for detecting associations with rare copy-number variants},
  journal = {PLoS Genetics},
  volume = {11},
  number = {10},
  year = {2015},
}

@article{neely_bondell_tzeng_2015,
  title = {A penalized likelihood approach for investigating gene-drug interactions in pharmacogenetic studies},
  volume = {71},
  issn = {1541-0420},
  doi = {10.1111/biom.12259},
  abstractNote = {Summary Pharmacogenetics investigates the relationship between heritable genetic variation and the variation in how individuals respond to drug therapies. 
Often, gene–drug interactions play a primary role in this response, and identifying these effects can aid in the development of individualized treatment regimes. Haplotypes can hold key information in understanding the association between genetic variation and drug response. However, the standard approach for haplotype-based association analysis does not directly address the research questions dictated by individualized medicine. A complementary post-hoc analysis is required, and this post-hoc analysis is usually under powered after adjusting for multiple comparisons and may lead to seemingly contradictory conclusions. In this work, we propose a penalized likelihood approach that is able to overcome the drawbacks of the standard approach and yield the desired personalized output. We demonstrate the utility of our method by applying it to the Scottish Randomized Trial in Ovarian Cancer. We also conducted simulation studies and showed that the proposed penalized method has comparable or more power than the standard approach and maintains low Type I error rates for both binary and quantitative drug responses. The largest performance gains are seen when the haplotype frequency is low, the difference in effect sizes are small, or the true relationship among the drugs is more complex.},
  number = {2},
  journal = {Biometrics},
  author = {Neely, Megan L. and Bondell, Howard D. and Tzeng, Jung-Ying},
  year = {2015},
  month = jun,
  pages = {529--537},
}

@article{zhao_marceau_zhang_tzeng_2015,
  title = {Assessing gene-environment interactions for common and rare variants with binary traits using gene-trait similarity regression},
  journal = {Genetics},
  volume = {199},
  number = {3},
  author = {Zhao, G. L. and Marceau, R. and Zhang, D. W. and Tzeng, J. 
Y.},
  year = {2015},
  pages = {695},
}

@article{hung_lin_chen_wang_huang_tzeng_2016,
  title = {Detection of Gene-Gene Interactions Using Multistage Sparse and Low-Rank Regression},
  volume = {72},
  issn = {1541-0420},
  doi = {10.1111/biom.12374},
  abstractNote = {Summary Finding an efficient and computationally feasible approach to deal with the curse of high-dimensionality is a daunting challenge faced by modern biological science. The problem becomes even more severe when the interactions are the research focus. To improve the performance of statistical analyses, we propose a sparse and low-rank (SLR) screening based on the combination of a low-rank interaction model and the Lasso screening. SLR models the interaction effects using a low-rank matrix to achieve parsimonious parametrization. The low-rank model increases the efficiency of statistical inference and, hence, SLR screening is able to more accurately detect gene–gene interactions than conventional methods. Incorporation of SLR screening into the Screen-and-Clean approach (Wasserman and Roeder, 2009; Wu et al., 2010) is also discussed, which suffers less penalty from Bonferroni correction, and is able to assign p-values for the identified variables in high-dimensional model. We apply the proposed screening procedure to the Warfarin dosage study and the CoLaus study. 
The results suggest that the new procedure can identify main and interaction effects that would have been omitted by conventional screening methods.},
  number = {1},
  journal = {Biometrics},
  author = {Hung, Hung and Lin, Yu-Ting and Chen, Penweng and Wang, Chen-Chien and Huang, Su-Yun and Tzeng, Jung-Ying},
  year = {2016},
  month = mar,
  pages = {85--94},
}

@article{maier_moser_chen_ripke_coryell_potash_scheftner_shi_weissman_hultman_et_al_2015,
  title = {Joint Analysis of Psychiatric Disorders Increases Accuracy of Risk Prediction for Schizophrenia, Bipolar Disorder, and Major Depressive Disorder},
  volume = {96},
  issn = {0002-9297},
  url = {http://dx.doi.org/10.1016/J.AJHG.2014.12.006},
  doi = {10.1016/J.AJHG.2014.12.006},
  abstractNote = {Genetic risk prediction has several potential applications in medical research and clinical practice and could be used, for example, to stratify a heterogeneous population of patients by their predicted genetic risk. However, for polygenic traits, such as psychiatric disorders, the accuracy of risk prediction is low. Here we use a multivariate linear mixed model and apply multi-trait genomic best linear unbiased prediction for genetic risk prediction. This method exploits correlations between disorders and simultaneously evaluates individual risk for each disorder. We show that the multivariate approach significantly increases the prediction accuracy for schizophrenia, bipolar disorder, and major depressive disorder in the discovery as well as in independent validation datasets. By grouping SNPs based on genome annotation and fitting multiple random effects, we show that the prediction accuracy could be further improved. The gain in prediction accuracy of the multivariate approach is equivalent to an increase in sample size of 34% for schizophrenia, 68% for bipolar disorder, and 76% for major depressive disorders using single trait models. 
Because our approach can be readily applied to any number of GWAS datasets of correlated traits, it is a flexible and powerful tool to maximize prediction accuracy. With current sample size, risk predictors are not useful in a clinical setting but already are a valuable research tool, for example in experimental designs comparing cases with high and low polygenic risk.},
  number = {2},
  journal = {The American Journal of Human Genetics},
  publisher = {Elsevier BV},
  author = {Maier, Robert and Moser, Gerhard and Chen, Guo-Bo and Ripke, Stephan and Coryell, William and Potash, James B. and Scheftner, William A. and Shi, Jianxin and Weissman, Myrna M. and Hultman, Christina M. and others},
  year = {2015},
  month = feb,
  pages = {283--294},
}

@article{wang_maity_hsiao_voora_kaddurah-daouk_tzeng_2015,
  author = {Wang, Z. and Maity, A. and Hsiao, C. K. and Voora, D. and Kaddurah-Daouk, R. and Tzeng, J. Y.},
  title = {Module-based association analysis for omics data with network structure},
  journal = {PLoS One},
  volume = {10},
  number = {3},
  year = {2015},
}

@article{hu_sun_tzeng_perou_2015,
  title = {Proper Use of Allele-Specific Expression Improves Statistical Power for {cis-eQTL} Mapping with {RNA-Seq} Data},
  volume = {110},
  issn = {1537-274X},
  doi = {10.1080/01621459.2015.1038449},
  abstractNote = {Studies of expression quantitative trait loci (eQTLs) offer insight into the molecular mechanisms of loci that were found to be associated with complex diseases and the mechanisms can be classified into cis- and trans-acting regulation. At present, high-throughput RNA sequencing (RNA-seq) is rapidly replacing expression microarrays to assess gene expression abundance. Unlike microarrays that only measure the total expression of each gene, RNA-seq also provides information on allele-specific expression (ASE), which can be used to distinguish cis-eQTLs from trans-eQTLs and, more importantly, enhance cis-eQTL mapping. 
However, assessing the cis-effect of a candidate eQTL on a gene requires knowledge of the haplotypes connecting the candidate eQTL and the gene, which can not be inferred with certainty. The existing two-stage approach that first phases the candidate eQTL against the gene and then treats the inferred phase as observed in the association analysis tends to attenuate the estimated cis-effect and reduce the power for detecting a cis-eQTL. In this article, we provide a maximum-likelihood framework for cis-eQTL mapping with RNA-seq data. Our approach integrates the inference of haplotypes and the association analysis into a single stage, and is thus unbiased and statistically powerful. We also develop a pipeline for performing a comprehensive scan of all local eQTLs for all genes in the genome by controlling for false discovery rate, and implement the methods in a computationally efficient software program. The advantages of the proposed methods over the existing ones are demonstrated through realistic simulation studies and an application to empirical breast cancer data from The Cancer Genome Atlas project. Supplementary materials for this article are available online.},
  number = {511},
  journal = {Journal of the American Statistical Association},
  author = {Hu, Yi-Juan and Sun, Wei and Tzeng, Jung-Ying and Perou, Charles M.},
  year = {2015},
  month = sep,
  pages = {962--974},
}

@article{tzeng_2015,
  title = {Psychiatric genome-wide association study analyses implicate neuronal, immune and histone pathways},
  volume = {18},
  issn = {1097-6256 1546-1726},
  url = {http://dx.doi.org/10.1038/NN.3922},
  doi = {10.1038/NN.3922},
  abstractNote = {Better analytical methods are needed to extract biological meaning from genome-wide association studies (GWAS) of psychiatric disorders. Here the authors take GWAS data from over 60,000 subjects, including patients with schizophrenia, bipolar disorder and major depression, and identify common etiological pathways shared amongst them. 
Genome-wide association studies (GWAS) of psychiatric disorders have identified multiple genetic associations with such disorders, but better methods are needed to derive the underlying biological mechanisms that these signals indicate. We sought to identify biological pathways in GWAS data from over 60,000 participants from the Psychiatric Genomics Consortium. We developed an analysis framework to rank pathways that requires only summary statistics. We combined this score across disorders to find common pathways across three adult psychiatric disorders: schizophrenia, major depression and bipolar disorder. Histone methylation processes showed the strongest association, and we also found statistically significant evidence for associations with multiple immune and neuronal signaling pathways and with the postsynaptic density. Our study indicates that risk variants for psychiatric disorders aggregate in particular biological pathways and that these pathways are frequently shared between disorders. Our results confirm known mechanisms and suggest several novel insights into the etiology of psychiatric disorders.},
  number = {2},
  journal = {Nature Neuroscience},
  publisher = {Springer Science and Business Media LLC},
  author = {Tzeng, Jung-Ying},
  year = {2015},
  month = jan,
  pages = {199--209},
}

@article{odushlaine_rossin_lee_duncan_parikshak_newhouse_ripke_neale_purcell_posthuma_et_al_2015,
  title = {Psychiatric genome-wide association study analyses implicate neuronal, immune and histone pathways},
  journal = {Nature Neuroscience},
  volume = {18},
  number = {2},
  author = {O'Dushlaine, C. and Rossin, L. and Lee, P. H. and Duncan, L. and Parikshak, N. N. and Newhouse, S. and Ripke, S. and Neale, B. M. and Purcell, S. M. and Posthuma, D. 
and others},
  year = {2015},
  pages = {199--209},
}

@article{kong_maity_hsu_tzeng_biometrics_2016,
  author = {Kong, D. and Maity, A. and Hsu, F. C. and Tzeng, J. Y.},
  title = {Testing and estimation in marker-set association study using semiparametric quantile regression kernel machine},
  journal = {Biometrics},
  volume = {72},
  number = {2},
  pages = {364--371},
  year = {2016},
  month = jun,
  issn = {1541-0420},
  doi = {10.1111/biom.12438},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84978986071&partnerID=MN8TOARS},
  abstractNote = {Summary We consider quantile regression for partially linear models where an outcome of interest is related to covariates and a marker set (e.g., gene or pathway). The covariate effects are modeled parametrically and the marker set effect of multiple loci is modeled using kernel machine. We propose an efficient algorithm to solve the corresponding optimization problem for estimating the effects of covariates and also introduce a powerful test for detecting the overall effect of the marker set. Our test is motivated by traditional score test, and borrows the idea of permutation test. Our estimation and testing procedures are evaluated numerically and applied to assess genetic association of change in fasting homocysteine level using the Vitamin Intervention for Stroke Prevention Trial data.},
}

@article{wang_epstein_tzeng_2014,
  title = {Analysis of Gene-Gene Interactions Using Gene-Trait Similarity Regression},
  volume = {78},
  issn = {1423-0062},
  doi = {10.1159/000360161},
  abstractNote = {Objective: Gene-gene interactions (G×G) are important to study because of their extensiveness in biological systems and their potential in explaining missing heritability of complex traits. In this work, we propose a new similarity-based test to assess G×G at the gene level, which permits the study of epistasis at biologically functional units with amplified interaction signals. 
Methods: Under the framework of gene-trait similarity regression (SimReg), we propose a gene-based test for detecting G×G. SimReg uses a regression model to correlate trait similarity with genotypic similarity across a gene. Unlike existing gene-level methods based on leading principal components (PCs), SimReg summarizes all information on genotypic variation within a gene and can be used to assess the joint/interactive effects of two genes as well as the effect of one gene conditional on another. Results: Using simulations and a real data application to the Warfarin study, we show that the SimReg G×G tests have satisfactory power and robustness under different genetic architecture when compared to existing gene-based interaction tests such as PC analysis or partial least squares. A genome-wide association study with approx. 20,000 genes may be completed on a parallel computing system in 2 weeks.},
  number = {1},
  journal = {Human Heredity},
  author = {Wang, Xin and Epstein, Michael P. and Tzeng, Jung-Ying},
  year = {2014},
  pages = {17--26},
}

@article{wang_maity_luo_neely_tzeng_2015,
  title = {Complete Effect-Profile Assessment in Association Studies With Multiple Genetic and Multiple Environmental Factors},
  volume = {39},
  issn = {1098-2272},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84921048438&partnerID=MN8TOARS},
  doi = {10.1002/gepi.21877},
  abstractNote = {Studying complex diseases in the post genome‐wide association studies (GWAS) era has led to developing methods that consider factor‐sets rather than individual genetic/environmental factors (i.e., Multi‐G‐Multi‐E studies), and mining for potential gene‐environment (G×E) interactions has proven to be an invaluable aid in both discovery and deciphering underlying biological mechanisms. 
Current approaches for examining effect profiles in Multi‐G‐Multi‐E analyses are either underpowered due to large degrees of freedom, ill‐suited for detecting G×E interactions due to imprecise modeling of the G and E effects, or lack of capacity for modeling interactions between two factor‐sets (e.g., existing methods focus primarily on a single E factor). In this work, we illustrate the issues encountered in constructing kernels for investigating interactions between two factor‐sets, and propose a simple yet intuitive solution to construct the G×E kernel that retains the ease‐of‐interpretation of classic regression. We also construct a series of kernel machine (KM) score tests to evaluate the complete effect profile (i.e., the G, E, and G×E effects individually or in combination). We show, via simulations and a data application, that the proposed KM methods outperform the classic and PC regressions across a range of scenarios, including varying effect size, effect structure, and interaction complexity. The largest power gain was observed when the underlying effect structure involved complex G×E interactions; however, the proposed methods have consistent, powerful performance when the effect profile is simple or complex, suggesting that the proposed method could be a useful tool for exploratory or confirmatory G×E analysis.},
  number = {2},
  journal = {Genetic Epidemiology},
  publisher = {Wiley-Blackwell},
  author = {Wang, Zhi and Maity, Arnab and Luo, Yiwen and Neely, Megan L. and Tzeng, Jung-Ying},
  year = {2015},
  month = feb,
  pages = {122--133},
}

@article{tzeng_lu_hsu_2014,
  title = {Gene-Level Pharmacogenetic Analysis on Survival Outcomes Using Gene-Trait Similarity Regression},
  volume = {8},
  issn = {1932-6157},
  doi = {10.1214/14-aoas735},
  abstractNote = {Gene/pathway-based methods are drawing significant attention due to their usefulness in detecting rare and common variants that affect disease susceptibility. 
The biological mechanism of drug responses indicates that a gene-based analysis has even greater potential in pharmacogenetics. Motivated by a study from the Vitamin Intervention for Stroke Prevention (VISP) trial, we develop a gene-trait similarity regression for survival analysis to assess the effect of a gene or pathway on time-to-event outcomes. The similarity regression has a general framework that covers a range of survival models, such as the proportional hazards model and the proportional odds model. The inference procedure developed under the proportional hazards model is robust against model misspecification. We derive the equivalence between the similarity survival regression and a random effects model, which further unifies the current variance-component based methods. We demonstrate the effectiveness of the proposed method through simulation studies. In addition, we apply the method to the VISP trial data to identify the genes that exhibit an association with the risk of a recurrent stroke. TCN2 gene was found to be associated with the recurrent stroke risk in the low-dose arm. This gene may impact recurrent stroke risk in response to cofactor therapy.},
  number = {2},
  journal = {Annals of Applied Statistics},
  author = {Tzeng, Jung-Ying and Lu, Wenbin and Hsu, Fang-Chi},
  year = {2014},
  month = jun,
  pages = {1232--1255},
}

@article{wright_sullivan_brooks_zou_sun_xia_madar_jansen_chung_zhou_et_al_2014,
  title = {Heritability and genomics of gene expression in peripheral blood},
  volume = {46},
  issn = {1061-4036 1546-1718},
  url = {http://dx.doi.org/10.1038/NG.2951},
  doi = {10.1038/NG.2951},
  abstractNote = {Fred Wright, Patrick Sullivan and colleagues present the results of a large expression QTL study of peripheral blood using a classic twin design with follow-up replication in independent samples. Their results enable a more precise estimate of the heritability of gene expression and provide a useful resource for exploring the genetic control of transcription. 
We assessed gene expression profiles in 2,752 twins, using a classic twin design to quantify expression heritability and quantitative trait loci (eQTLs) in peripheral blood. The most highly heritable genes (∼777) were grouped into distinct expression clusters, enriched in gene-poor regions, associated with specific gene function or ontology classes, and strongly associated with disease designation. The design enabled a comparison of twin-based heritability to estimates based on dizygotic identity-by-descent sharing and distant genetic relatedness. Consideration of sampling variation suggests that previous heritability estimates have been upwardly biased. Genotyping of 2,494 twins enabled powerful identification of eQTLs, which we further examined in a replication set of 1,895 unrelated subjects. A large number of non-redundant local eQTLs (6,756) met replication criteria, whereas a relatively small number of distant eQTLs (165) met quality control and replication standards. Our results provide a new resource toward understanding the genetic control of transcription.}, number={5}, journal={Nature Genetics}, publisher={Springer Science and Business Media LLC}, author={Wright, Fred A and Sullivan, Patrick F and Brooks, Andrew I and Zou, Fei and Sun, Wei and Xia, Kai and Madar, Vered and Jansen, Rick and Chung, Wonil and Zhou, Yi-Hui and others}, year={2014}, month={Apr}, pages={430–437} } @article{hu_tzeng_2014, title={Integrative gene set analysis of multi-platform data with sample heterogeneity}, volume={30}, ISSN={["1460-2059"]}, DOI={10.1093/bioinformatics/btu060}, abstractNote={Abstract Motivation: Gene set analysis is a popular method for large-scale genomic studies. Because genes that have common biological features are analyzed jointly, gene set analysis often achieves better power and generates more biologically informative results. With the advancement of technologies, genomic studies with multi-platform data have become increasingly common. Several strategies have been proposed that integrate genomic data from multiple platforms to perform gene set analysis. To evaluate the performances of existing integrative gene set methods under various scenarios, we conduct a comparative simulation analysis based on The Cancer Genome Atlas breast cancer dataset. Results: We find that existing methods for gene set analysis are less effective when sample heterogeneity exists. To address this issue, we develop three methods for multi-platform genomic data with heterogeneity: two non-parametric methods, multi-platform Mann–Whitney statistics and multi-platform outlier robust T-statistics, and a parametric method, multi-platform likelihood ratio statistics. Using simulations, we show that the proposed multi-platform Mann–Whitney statistics method has higher power for heterogeneous samples and comparable performance for homogeneous samples when compared with the existing methods. Our real data applications to two datasets of The Cancer Genome Atlas also suggest that the proposed methods are able to identify novel pathways that are missed by other strategies. 
Availability and implementation:  http://www4.stat.ncsu.edu/~jytzeng/Software/Multiplatform_gene_set_analysis/ Contact:  john.hu@omicsoft.com, jhu7@ncsu.edu Supplementary information:  Supplementary data are available at Bioinformatics online.}, number={11}, journal={BIOINFORMATICS}, author={Hu, Jun and Tzeng, Jung-Ying}, year={2014}, month={Jun}, pages={1501–1507} } @article{wang_zhang_tzeng_2014, title={Pathway-Guided Identification of Gene-Gene Interactions}, volume={78}, ISSN={["1469-1809"]}, DOI={10.1111/ahg.12080}, abstractNote={Assessing gene‐gene interactions (GxG) at the gene level can permit examination of epistasis at biologically functional units with amplified interaction signals from marker‐marker pairs. While current gene‐based GxG methods tend to be designed for two or a few genes, for complex traits, it is often common to have a list of many candidate genes to explore GxG. We propose a regression model with pathway‐guided regularization for detecting interactions among genes. Specifically, we use the principal components to summarize the SNP‐SNP interactions between a gene pair, and use an L1 penalty that incorporates adaptive weights based on biological guidance and trait supervision to identify important main and interaction effects. Our approach aims to combine biological guidance and data adaptiveness, and yields credible findings that may be likely to shed insights in order to formulate biological hypotheses for further molecular studies. The proposed approach can be used to explore the GxG with a list of many candidate genes and is applicable even when sample size is smaller than the number of predictors studied. We evaluate the utility of the proposed method using simulation and real data analysis. 
The results suggest improved performance over methods not utilizing pathway and trait guidance.}, number={6}, journal={ANNALS OF HUMAN GENETICS}, author={Wang, Xin and Zhang, Daowen and Tzeng, Jung-Ying}, year={2014}, month={Nov}, pages={478–491} } @article{sullivan_daly_ripke_lewis_lin_wray_neale_levinson_breen_byrne_et al._2013, title={A mega-analysis of genome-wide association studies for major depressive disorder}, volume={18}, number={4}, journal={Molecular Psychiatry}, author={Sullivan, P. F. and Daly, M. J. and Ripke, S. and Lewis, C. M. and Lin, D. Y. and Wray, N. R. and Neale, B. and Levinson, D. F. and Breen, G. and Byrne, E. M. and others}, year={2013}, pages={497–511} } @article{tzeng_2013, title={Genetic relationship between five psychiatric disorders estimated from genome-wide SNPs}, volume={45}, ISSN={1061-4036 1546-1718}, url={http://dx.doi.org/10.1038/NG.2711}, DOI={10.1038/NG.2711}, abstractNote={Naomi Wray and colleagues report an analysis of genome-wide association data sets from the Psychiatric Genomics Consortium for five psychiatric disorders. They find that common variation explains 17–29% of the variance in liability and provide further support for a shared genetic etiology for these related psychiatric disorders. Most psychiatric disorders are moderately to highly heritable. The degree to which genetic variation is unique to individual disorders or shared across disorders is unclear. To examine shared genetic etiology, we use genome-wide genotype data from the Psychiatric Genomics Consortium (PGC) for cases and controls in schizophrenia, bipolar disorder, major depressive disorder, autism spectrum disorders (ASD) and attention-deficit/hyperactivity disorder (ADHD). We apply univariate and bivariate methods for the estimation of genetic variation within and covariation between disorders. SNPs explained 17–29% of the variance in liability. 
The genetic correlation calculated using common SNPs was high between schizophrenia and bipolar disorder (0.68 ± 0.04 s.e.), moderate between schizophrenia and major depressive disorder (0.43 ± 0.06 s.e.), bipolar disorder and major depressive disorder (0.47 ± 0.06 s.e.), and ADHD and major depressive disorder (0.32 ± 0.07 s.e.), low between schizophrenia and ASD (0.16 ± 0.06 s.e.) and non-significant for other pairs of disorders as well as between psychiatric disorders and the negative control of Crohn's disease. This empirical evidence of shared genetic etiology for psychiatric disorders can inform nosology and encourages the investigation of common pathophysiologies for related disorders.}, number={9}, journal={Nature Genetics}, publisher={Springer Science and Business Media LLC}, author={Lee, S. H. and Ripke, S. and Neale, B. M. and Faraone, S. V. and Purcell, S. M. and Perlis, R. H. and Mowry, B. J. and Thapar, A. and Goddard, M. E. and Witte, J. S. and others}, year={2013}, month={Aug}, pages={984–994} } @article{lee_ripke_neale_faraone_purcell_perlis_mowry_thapar_goddard_witte_et al._2013, title={Genetic relationship between five psychiatric disorders estimated from genome-wide SNPs}, volume={45}, number={9}, journal={Nature Genetics}, author={Lee, S. H. and Ripke, S. and Neale, B. M. and Faraone, S. V. and Purcell, S. M. and Perlis, R. H. and Mowry, B. J. and Thapar, A. and Goddard, M. E. and Witte, J. S. and others}, year={2013}, pages={984–994} } @article{maity_sullivan_tzeng_2012, title={Multivariate Phenotype Association Analysis by Marker-Set Kernel Machine Regression}, volume={36}, ISSN={["1098-2272"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84867539542&partnerID=MN8TOARS}, DOI={10.1002/gepi.21663}, abstractNote={Genetic studies of complex diseases often collect multiple phenotypes relevant to the disorders. As these phenotypes can be correlated and share common genetic mechanisms, jointly analyzing these traits may bring more power to detect genes influencing individual or multiple phenotypes. 
Given the advancement brought by the multivariate phenotype approaches and the multimarker kernel machine regression, we construct a multivariate regression based on kernel machine to facilitate the joint evaluation of multimarker effects on multiple phenotypes. The kernel machine serves as a powerful dimension‐reduction tool to capture complex effects among markers. The multivariate framework incorporates the potentially correlated multidimensional phenotypic information and accommodates common or different environmental covariates for each trait. We derive the multivariate kernel machine test based on a score‐like statistic, and conduct simulations to evaluate the validity and efficacy of the method. We also study the performance of the commonly adapted strategies for kernel machine analysis on multiple phenotypes, including the multiple univariate kernel machine tests with original phenotypes or with their principal components. Our results suggest that none of these approaches has the uniformly best power, and the optimal test depends on the magnitude of the phenotype correlation and the effect patterns. However, the multivariate test retains to be a reasonable approach when the multiple phenotypes have none or mild correlations, and gives the best power once the correlation becomes stronger or when there exist genes that affect more than one phenotype. We illustrate the utility of the multivariate kernel machine method through the Clinical Antipsychotic Trials of Intervention Effectiveness antibody study.}, number={7}, journal={GENETIC EPIDEMIOLOGY}, publisher={Wiley-Blackwell}, author={Maity, Arnab and Sullivan, Patrick F. and Tzeng, Jung-Ying}, year={2012}, month={Nov}, pages={686–695} } @article{lee_tzeng_huang_hsiao_2011, title={Combining an evolution-guided clustering algorithm and haplotype-based LRT in family association studies}, volume={12}, journal={BMC Genetics}, author={Lee, M. H. and Tzeng, J. Y. and Huang, S. Y. and Hsiao, C. 
K.}, year={2011} } @article{tzeng_zhang_pongpanich_smith_mccarthy_sale_worrall_hsu_thomas_sullivan_2011, title={Studying Gene and Gene-Environment Effects of Uncommon and Common Variants on Continuous Traits: A Marker-Set Approach Using Gene-Trait Similarity Regression}, volume={89}, ISSN={["1537-6605"]}, DOI={10.1016/j.ajhg.2011.07.007}, abstractNote={Genomic association analyses of complex traits demand statistical tools that are capable of detecting small effects of common and rare variants and modeling complex interaction effects and yet are computationally feasible. In this work, we introduce a similarity-based regression method for assessing the main genetic and interaction effects of a group of markers on quantitative traits. The method uses genetic similarity to aggregate information from multiple polymorphic sites and integrates adaptive weights that depend on allele frequencies to accommodate common and uncommon variants. Collapsing information at the similarity level instead of the genotype level avoids canceling signals that have the opposite etiological effects and is applicable to any class of genetic variants without the need for dichotomizing the allele types. To assess gene-trait associations, we regress trait similarities for pairs of unrelated individuals on their genetic similarities and assess association by using a score test whose limiting distribution is derived in this work. The proposed regression framework allows for covariates, has the capacity to model both main and interaction effects, can be applied to a mixture of different polymorphism types, and is computationally efficient. These features make it an ideal tool for evaluating associations between phenotype and marker sets defined by linkage disequilibrium (LD) blocks, genes, or pathways in whole-genome analysis.}, number={2}, journal={AMERICAN JOURNAL OF HUMAN GENETICS}, author={Tzeng, Jung-Ying and Zhang, Daowen and Pongpanich, Monnat and Smith, Chris and McCarthy, Mark I. 
and Sale, Michele M. and Worrall, Bradford B. and Hsu, Fang-Chi and Thomas, Duncan C. and Sullivan, Patrick F.}, year={2011}, month={Aug}, pages={277–288} } @article{pongpanich_sullivan_tzeng_2010, title={A quality control algorithm for filtering SNPs in genome-wide association studies}, volume={26}, ISSN={["1460-2059"]}, DOI={10.1093/bioinformatics/btq272}, abstractNote={Abstract Motivation: The quality control (QC) filtering of single nucleotide polymorphisms (SNPs) is an important step in genome-wide association studies to minimize potential false findings. SNP QC commonly uses expert-guided filters based on QC variables [e.g. Hardy–Weinberg equilibrium, missing proportion (MSP) and minor allele frequency (MAF)] to remove SNPs with insufficient genotyping quality. The rationale of the expert filters is sensible and concrete, but its implementation requires arbitrary thresholds and does not jointly consider all QC features. Results: We propose an algorithm that is based on principal component analysis and clustering analysis to identify low-quality SNPs. The method minimizes the use of arbitrary cutoff values, allows a collective consideration of the QC features and provides conditional thresholds contingent on other QC variables (e.g. different MSP thresholds for different MAFs). We apply our method to the seven studies from the Wellcome Trust Case Control Consortium and the major depressive disorder study from the Genetic Association Information Network. We measured the performance of our method compared to the expert filters based on the following criteria: (i) percentage of SNPs excluded due to low quality; (ii) inflation factor of the test statistics (λ); (iii) number of false associations found in the filtered dataset; and (iv) number of true associations missed in the filtered dataset. 
The results suggest that with the same or fewer SNPs excluded, the proposed algorithm tends to give a similar or lower value of λ, a reduced number of false associations, and retains all true associations. Availability: The algorithm is available at http://www4.stat.ncsu.edu/~jytzeng/software.php Contact:  jytzeng@stat.ncsu.edu Supplementary information:  Supplementary data are available at Bioinformatics online.}, number={14}, journal={BIOINFORMATICS}, author={Pongpanich, Monnat and Sullivan, Patrick F. and Tzeng, Jung-Ying}, year={2010}, month={Jul}, pages={1731–1737} } @article{koehler_bondell_tzeng_2010, title={Evaluating Haplotype Effects in Case-Control Studies via Penalized-Likelihood Approaches: Prospective or Retrospective Analysis?}, volume={34}, ISSN={["1098-2272"]}, DOI={10.1002/gepi.20545}, abstractNote={Abstract Penalized likelihood methods have become increasingly popular in recent years for evaluating haplotype‐phenotype association in case‐control studies. Although a retrospective likelihood is dictated by the sampling scheme, these penalized methods are typically built on prospective likelihoods due to their modeling simplicity and computational feasibility. It has been well documented that for unpenalized methods, prospective analyses of case‐control data can be valid but less efficient than their retrospective counterparts when testing for association, and result in substantial bias when estimating the haplotype effects. For penalized methods, which combine effect estimation and testing in one step, the impact of using a prospective likelihood is not clear. In this work, we examine the consequences of ignoring the sampling scheme for haplotype‐based penalized likelihood methods. Our results suggest that the impact of prospective analyses depends on (1) the underlying genetic mode and (2) the genetic model adopted in the analysis. 
When the correct genetic model is used, the difference between the two analyses is negligible for additive and slight for dominant haplotype effects. For recessive haplotype effects, the more appropriate retrospective likelihood clearly outperforms the prospective likelihood. If an additive model is incorrectly used, as the true underlying genetic mode is unknown a priori, both retrospective and prospective penalized methods suffer from a sizeable power loss and increase in bias. The impact of using the incorrect genetic model is much bigger on retrospective analyses than prospective analyses, and results in comparable performances for both methods. An application of these methods to the Genetic Analysis Workshop 15 rheumatoid arthritis data is provided. Genet. Epidemiol. 34:892–911, 2010. © 2010 Wiley‐Liss, Inc.}, number={8}, journal={GENETIC EPIDEMIOLOGY}, author={Koehler, Megan L. and Bondell, Howard D. and Tzeng, Jung-Ying}, year={2010}, month={Dec}, pages={892–911} } @article{tzeng_lu_farmen_liu_sullivan_2010, title={Haplotype-Based Pharmacogenetic Analysis for Longitudinal Quantitative Traits in the Presence of Dropout}, volume={20}, ISSN={["1520-5711"]}, DOI={10.1080/10543400903572787}, abstractNote={We propose a variety of methods based on the generalized estimation equations to address the issues encountered in haplotype-based pharmacogenetic analysis, including analysis of longitudinal data with outcome-dependent dropouts, and evaluation of the high-dimensional haplotype and haplotype–drug interaction effects in an overall manner. We use the inverse probability weights to handle the outcome-dependent dropouts under the missing-at-random assumption, and incorporate the weighted L 1 penalty to select important main and interaction effects with high dimensionality. 
The proposed methods are easy to implement, computationally efficient, and provide an optimal balance between false positives and false negatives in detecting genetic effects.}, number={2}, journal={JOURNAL OF BIOPHARMACEUTICAL STATISTICS}, author={Tzeng, Jung-Ying and Lu, Wenbin and Farmen, Mark W. and Liu, Youfang and Sullivan, Patrick F.}, year={2010}, pages={334–350} } @article{liu_li_satten_allen_tzeng_2009, title={A Regression-based Association Test for Case-control Studies that Uses Inferred Ancestral Haplotype Similarity}, volume={73}, ISSN={["1469-1809"]}, DOI={10.1111/j.1469-1809.2009.00536.x}, abstractNote={Summary Association methods based on haplotype similarity (HS) can overcome power and stability issues encountered in standard haplotype analyses. Current HS methods can be generally classified into evolutionary and two‐sample approaches. We propose a new regression‐based HS association method for case‐control studies that incorporates covariate information and combines the advantages of the two classes of approaches by using inferred ancestral haplotypes. We first estimate the ancestral haplotypes of case individuals and then, for each individual, an ancestral‐haplotype‐based similarity score is computed by comparing that individual's observed genotype with the estimated ancestral haplotypes. Trait values are then regressed on the similarity scores. Covariates can easily be incorporated into this regression framework. To account for the bias in the raw p‐values due to the use of case data in constructing ancestral haplotypes, as well as to account for variation in ancestral haplotype estimation, a permutation procedure is adopted to obtain empirical p‐values. Compared with the standard haplotype score test and the multilocus T2 test, our method improves power when neither the allele frequency nor linkage disequilibrium between the disease locus and its neighboring SNPs is too low and is comparable in other scenarios. 
We applied our method to the Genetic Analysis Workshop 15 simulated SNP data and successfully pinpointed a stretch of SNPs that covers the fine‐scale region where the causal locus is located.}, journal={ANNALS OF HUMAN GENETICS}, author={Liu, Youfang and Li, Yi-Ju and Satten, Glen A. and Allen, Andrew S. and Tzeng, Jung-Ying}, year={2009}, month={Sep}, pages={520–526} } @article{tzeng_bondell_2009, title={A comprehensive approach to haplotype-specific analysis by penalized likelihood}, volume={18}, ISSN={1018-4813 1476-5438}, url={http://dx.doi.org/10.1038/ejhg.2009.118}, DOI={10.1038/ejhg.2009.118}, abstractNote={Haplotypes can hold key information to understand the role of candidate genes in disease etiology. However, standard haplotype analysis has yet been able to fully reveal the information retained by haplotypes. In most analysis, haplotype inference focuses on relative effects compared with an arbitrarily chosen baseline haplotype. It does not depict the effect structure unless an additional inference procedure is used in a secondary post hoc analysis, and such analysis tends to be lack of power. In this study, we propose a penalized regression approach to systematically evaluate the pattern and structure of the haplotype effects. By specifying an L1 penalty on the pairwise difference of the haplotype effects, we present a model-based haplotype analysis to detect and to characterize the haplotypic association signals. The proposed method avoids the need to choose a baseline haplotype; it simultaneously carries out the effect estimation and effect comparison of all haplotypes, and outputs the haplotype group structure based on their effect size. Finally, our penalty weights are theoretically designed to balance the likelihood and the penalty term in an appropriate manner. The proposed method can be used as a tool to comprehend candidate regions identified from a genome or chromosomal scan. 
Simulation studies reveal the better abilities of the proposed method to identify the haplotype effect structure compared with the traditional haplotype association methods, demonstrating the informativeness and powerfulness of the proposed method.}, number={1}, journal={European Journal of Human Genetics}, publisher={Springer Science and Business Media LLC}, author={Tzeng, Jung-Ying and Bondell, Howard D}, year={2009}, month={Jul}, pages={95–103} } @article{sullivan_lin_tzeng_van den oord_perkins_stroup_wagner_lee_wright_zou_et al._2009, title={Erratum: Genomewide association for schizophrenia in the CATIE study: results of stage 1}, volume={14}, ISSN={1359-4184 1476-5578}, url={http://dx.doi.org/10.1038/mp.2008.74}, DOI={10.1038/mp.2008.74}, abstractNote={Correction to: Molecular Psychiatry (2008) 13, 570–584; doi: 10.1038/mp.2008.25 For technical reasons, Supplementary Tables 2, 3 and 4 were not published online. They now appear online at www.nature.com/mp.}, number={12}, journal={Molecular Psychiatry}, publisher={Springer Science and Business Media LLC}, author={Sullivan, P F and Lin, D and Tzeng, J-Y and van den Oord, E and Perkins, D and Stroup, T S and Wagner, M and Lee, S and Wright, F A and Zou, F and others}, year={2009}, month={Nov}, pages={1144} } @article{tzeng_zhang_chang_thomas_davidian_2009, title={Gene-Trait Similarity Regression for Multimarker-Based Association Analysis}, volume={65}, ISSN={0006-341X}, url={http://dx.doi.org/10.1111/j.1541-0420.2008.01176.x}, DOI={10.1111/j.1541-0420.2008.01176.x}, abstractNote={Summary We propose a similarity‐based regression method to detect associations between traits and multimarker genotypes. The model regresses similarity in traits for pairs of “unrelated” individuals on their haplotype similarities, and detects the significance by a score test for which the limiting distribution is derived. 
The proposed method allows for covariates, uses phase‐independent similarity measures to bypass the needs to impute phase information, and is applicable to traits of general types (e.g., quantitative and qualitative traits). We also show that the gene‐trait similarity regression is closely connected with random effects haplotype analysis, although commonly they are considered as separate modeling tools. This connection unites the classic haplotype sharing methods with the variance‐component approaches, which enables direct derivation of analytical properties of the sharing statistics even when the similarity regression model becomes analytically challenging.}, number={3}, journal={Biometrics}, publisher={Wiley}, author={Tzeng, Jung-Ying and Zhang, Daowen and Chang, Sheng-Mao and Thomas, Duncan C. and Davidian, Marie}, year={2009}, month={Feb}, pages={822–832} } @article{sullivan_geus_willemsen_james_smit_zandbelt_arolt_baune_blackwood_cichon_et al._2009, title={Genome-wide association for major depressive disorder: A possible role for the presynaptic protein piccolo}, volume={14}, number={4}, journal={Molecular Psychiatry}, author={Sullivan, P. F. and de Geus, E. J. C. and Willemsen, G. and James, M. R. and Smit, J. H. and Zandbelt, T. and Arolt, V. and Baune, B. T. and Blackwood, D. and Cichon, S. and others}, year={2009}, pages={359–375} } @article{sullivan_de geus_willemsen_james_smit_zandbelt_arolt_baune_blackwood_cichon_et al._2008, title={Genome-wide association for major depressive disorder: a possible role for the presynaptic protein piccolo}, volume={14}, ISSN={1359-4184 1476-5578}, url={http://dx.doi.org/10.1038/mp.2008.125}, DOI={10.1038/mp.2008.125}, abstractNote={Major depressive disorder (MDD) is a common complex trait with enormous public health significance. 
As part of the Genetic Association Information Network initiative of the US Foundation for the National Institutes of Health, we conducted a genome-wide association study of 435 291 single nucleotide polymorphisms (SNPs) genotyped in 1738 MDD cases and 1802 controls selected to be at low liability for MDD. Of the top 200, 11 signals localized to a 167 kb region overlapping the gene piccolo (PCLO, whose protein product localizes to the cytomatrix of the presynaptic active zone and is important in monoaminergic neurotransmission in the brain) with P-values of $7.7 \times 10^{-7}$ for rs2715148 and $1.2 \times 10^{-6}$ for rs2522833. We undertook replication of SNPs in this region in five independent samples (6079 MDD independent cases and 5893 controls) but no SNP exceeded the replication significance threshold when all replication samples were analyzed together. However, there was heterogeneity in the replication samples, and secondary analysis of the original sample with the sample of greatest similarity yielded $P = 6.4 \times 10^{-8}$ for the nonsynonymous SNP rs2522833 that gives rise to a serine to alanine substitution near a C2 calcium-binding domain of the PCLO protein. With the integrated replication effort, we present a specific hypothesis for further studies.}, number={4}, journal={Molecular Psychiatry}, publisher={Springer Science and Business Media LLC}, author={Sullivan, P F and de Geus, E J C and Willemsen, G and James, M R and Smit, J H and Zandbelt, T and Arolt, V and Baune, B T and Blackwood, D and Cichon, S and others}, year={2008}, month={Dec}, pages={359–375} } @article{sullivan_lin_tzeng_van den oord_perkins_stroup_wagner_lee_wright_zou_et al._2008, title={Genomewide association for schizophrenia in the CATIE study: results of stage 1}, volume={13}, ISSN={1359-4184 1476-5578}, url={http://dx.doi.org/10.1038/mp.2008.25}, DOI={10.1038/mp.2008.25}, abstractNote={Little is known for certain about the genetics of schizophrenia. 
The advent of genomewide association has been widely anticipated as a promising means to identify reproducible DNA sequence variation associated with this important and debilitating disorder. A total of 738 cases with DSM-IV schizophrenia (all participants in the CATIE study) and 733 group-matched controls were genotyped for 492 900 single-nucleotide polymorphisms (SNPs) using the Affymetrix 500K two-chip genotyping platform plus a custom 164K fill-in chip. Following multiple quality control steps for both subjects and SNPs, logistic regression analyses were used to assess the evidence for association of all SNPs with schizophrenia. We identified a number of promising SNPs for follow-up studies, although no SNP or multimarker combination of SNPs achieved genomewide statistical significance. Although a few signals coincided with genomic regions previously implicated in schizophrenia, chance could not be excluded. These data do not provide evidence for the involvement of any genomic region with schizophrenia detectable with moderate sample size. However, a planned genomewide association study for response phenotypes and inclusion of individual phenotype and genotype data from this study in meta-analyses hold promise for eventual identification of susceptibility and protective variants.}, number={6}, journal={Molecular Psychiatry}, publisher={Springer Science and Business Media LLC}, author={Sullivan, P F and Lin, D and Tzeng, J-Y and van den Oord, E and Perkins, D and Stroup, T S and Wagner, M and Lee, S and Wright, F A and Zou, F and others}, year={2008}, month={Mar}, pages={570–584} } @article{tzeng_zhang_2007, title={Haplotype-based association analysis via variance-components score test}, volume={81}, ISSN={["0002-9297"]}, DOI={10.1086/521558}, abstractNote={Haplotypes provide a more informative format of polymorphisms for genetic association analysis than do individual single-nucleotide polymorphisms. 
However, the practical efficacy of haplotype-based association analysis is challenged by a trade-off between the benefits of modeling abundant variation and the cost of the extra degrees of freedom. To reduce the degrees of freedom, several strategies have been considered in the literature. They include (1) clustering evolutionarily close haplotypes, (2) modeling the level of haplotype sharing, and (3) smoothing haplotype effects by introducing a correlation structure for haplotype effects and studying the variance components (VC) for association. Although the first two strategies enjoy a fair extent of power gain, empirical evidence showed that VC methods may exhibit only similar or less power than the standard haplotype regression method, even in cases of many haplotypes. In this study, we report possible reasons that cause the underpowered phenomenon and show how the power of the VC strategy can be improved. We construct a score test based on the restricted maximum likelihood or the marginal likelihood function of the VC and identify its nontypical limiting distribution. Through simulation, we demonstrate the validity of the test and investigate the power performance of the VC approach and that of the standard haplotype regression approach. With suitable choices for the correlation structure, the proposed method can be directly applied to unphased genotypic data. Our method is applicable to a wide-ranging class of models and is computationally efficient and easy to implement. 
The broad coverage and the fast and easy implementation of this method make the VC strategy an effective tool for haplotype analysis, even in modern genomewide association studies.}, number={5}, journal={AMERICAN JOURNAL OF HUMAN GENETICS}, author={Tzeng, Jung-Ying and Zhang, Daowen}, year={2007}, month={Nov}, pages={927–938} }

% Journal article: two-stage design for multiple testing in association studies.
% Cleaned export artifacts: bare ISSN, month macro, "--" page range, journal casing.
@article{wen_tzeng_kao_hsiao_2006,
  author       = {Wen, Shu-Hui and Tzeng, Jung-Ying and Kao, Jau-Tsuen and Hsiao, Chuhsing Kate},
  title        = {A two-stage design for multiple testing in large-scale association studies},
  journal      = {Journal of Human Genetics},
  volume       = {51},
  number       = {6},
  pages        = {523--532},
  year         = {2006},
  month        = jun,
  ISSN         = {1435-232X},
  DOI          = {10.1007/s10038-006-0393-6},
  abstractNote = {Modern association studies often involve a large number of markers and hence may encounter the problem of testing multiple hypotheses. Traditional procedures are usually over-conservative and with low power to detect mild genetic effects. From the design perspective, we propose a two-stage selection procedure to address this concern. Our main principle is to reduce the total number of tests by removing clearly unassociated markers in the first-stage test. Next, conditional on the findings of the first stage, which uses a less stringent nominal level, a more conservative test is conducted in the second stage using the augmented data and the data from the first stage. Previous studies have suggested using independent samples to avoid inflated errors. However, we found that, after accounting for the dependence between these two samples, the true discovery rate increases substantially. In addition, the cost of genotyping can be greatly reduced via this approach. Results from a study of hypertriglyceridemia and simulations suggest the two-stage method has a higher overall true positive rate (TPR) with a controlled overall false positive rate (FPR) when compared with single-stage approaches. We also report the analytical form of its overall FPR, which may be useful in guiding study design to achieve a high TPR while retaining the desired FPR.},
}

% Journal article: published comment in JASA (no DOI or ISSN in the source record).
% Cleaned export artifact: "--" page range.
@article{tzeng_roeder_2006,
  author  = {Tzeng, J. Y. and Roeder, K.},
  title   = {Likelihood-based inference on haplotype effects in genetic association studies - Comment},
  journal = {Journal of the American Statistical Association},
  volume  = {101},
  number  = {473},
  pages   = {111--114},
  year    = {2006},
}

% Journal article: regression-based association analysis with clustered haplotypes.
% Cleaned export artifacts: spaced author initials, bare ISSN, month macro,
% "--" page range, journal casing.
@article{tzeng_wang_kao_hsiao_2006,
  author       = {Tzeng, J. Y. and Wang, C. H. and Kao, J. T. and Hsiao, C. K.},
  title        = {Regression-based association analysis with clustered haplotypes through use of genotypes},
  journal      = {American Journal of Human Genetics},
  volume       = {78},
  number       = {2},
  pages        = {231--242},
  year         = {2006},
  month        = feb,
  ISSN         = {1537-6605},
  DOI          = {10.1086/500025},
  abstractNote = {Haplotype-based association analysis has been recognized as a tool with high resolution and potentially great power for identifying modest etiological effects of genes. However, in practice, its efficacy has not been as successfully reproduced as expected in theory. One primary cause is that such analysis tends to require a large number of parameters to capture the abundant haplotype varieties, and many of those are expended on rare haplotypes for which studies would have insufficient power to detect association even if it existed. To concentrate statistical power on more-relevant inferences, in this study, we developed a regression-based approach using clustered haplotypes to assess haplotype-phenotype association. Specifically, we generalized the probabilistic clustering methods of Tzeng to the generalized linear model (GLM) framework established by Schaid et al. The proposed method uses unphased genotypes and incorporates both phase uncertainty and clustering uncertainty. Its GLM framework allows adjustment of covariates and can model qualitative and quantitative traits. It can also evaluate the overall haplotype association or the individual haplotype effects. We applied the proposed approach to study the association between hypertriglyceridemia and the apolipoprotein A5 gene. Through simulation studies, we assessed the performance of the proposed approach and demonstrate its validity and power in testing for haplotype-trait association.},
}

% Journal article: evolutionary-based grouping of haplotypes.
% Cleaned export artifacts: stray "Abstract" prefix and U+2010 hyphens in the
% abstract, spaced author initials, bare ISSN, month macro, "--" page range,
% journal casing.
@article{tzeng_2005,
  author       = {Tzeng, J. Y.},
  title        = {Evolutionary-based grouping of haplotypes in association analysis},
  journal      = {Genetic Epidemiology},
  volume       = {28},
  number       = {3},
  pages        = {220--231},
  year         = {2005},
  month        = apr,
  ISSN         = {0741-0395},
  DOI          = {10.1002/gepi.20063},
  abstractNote = {Haplotypes incorporate more information about the underlying polymorphisms than do genotypes for individual SNPs, and are considered as a more informative format of data in association analysis. To model haplotypes requires high degrees of freedom, which could decrease power and limit a model's capacity to incorporate other complex effects, such as gene-gene interactions. Even within haplotype blocks, high degrees of freedom are still a concern unless one chooses to discard rare haplotypes. To increase the efficiency and power of haplotype analysis, we adapt the evolutionary concepts of cladistic analyses and propose a grouping algorithm to cluster rare haplotypes to the corresponding ancestral haplotypes. The algorithm determines the cluster bases by preserving common haplotypes using a criterion built on the Shannon information content. Each haplotype is then assigned to its appropriate clusters probabilistically according to the cladistic relationship. Through this algorithm, we perform association analysis based on groups of haplotypes. Simulation results indicate power increases for performing tests on the haplotype clusters when compared to tests using original haplotypes or the truncated haplotype distribution. Genet. Epidemiol. © 2005 Wiley-Liss, Inc.},
}