@article{ford_lin_zhou_wright_gombar_sedykh_shah_chiu_rusyn_2024, title={Characterizing PFAS hazards and risks: a human population-based in vitro cardiotoxicity assessment strategy}, volume={18}, ISSN={["1479-7364"]}, DOI={10.1186/s40246-024-00665-x}, abstractNote={Abstract Per- and poly-fluoroalkyl substances (PFAS) are emerging contaminants of concern because of their wide use, persistence, and potential to be hazardous to both humans and the environment. Several PFAS have been designated as substances of concern; however, most PFAS in commerce lack toxicology and exposure data to evaluate their potential hazards and risks. Cardiotoxicity has been identified as a likely human health concern, and cell-based assays are the most sensible approach for screening and prioritization of PFAS. Human-induced pluripotent stem cell (iPSC)-derived cardiomyocytes are a widely used method to test for cardiotoxicity, and recent studies showed that many PFAS affect these cells. Because iPSC-derived cardiomyocytes are available from different donors, they also can be used to quantify human variability in responses to PFAS. The primary objective of this study was to characterize potential human cardiotoxic hazard, risk, and inter-individual variability in responses to PFAS. A total of 56 PFAS from different subclasses were tested in concentration-response using human iPSC-derived cardiomyocytes from 16 donors without known heart disease. Kinetic calcium flux and high-content imaging were used to evaluate biologically-relevant phenotypes such as beat frequency, repolarization, and cytotoxicity. Of the tested PFAS, 46 showed concentration-response effects in at least one phenotype and donor; however, a wide range of sensitivities were observed across donors. Inter-individual variability in the effects could be quantified for 19 PFAS, and risk characterization could be performed for 20 PFAS based on available exposure information. 
For most tested PFAS, toxicodynamic variability was within a factor of 10 and the margins of exposure were above 100. This study identified PFAS that may pose cardiotoxicity risk and have high inter-individual variability. It also demonstrated the feasibility of using a population-based human in vitro method to quantify population variability and identify cardiotoxicity risks of emerging contaminants.}, number={1}, journal={HUMAN GENOMICS}, author={Ford, Lucie C. and Lin, Hsing-Chieh and Zhou, Yi-Hui and Wright, Fred A. and Gombar, Vijay K. and Sedykh, Alexander and Shah, Ruchir R. and Chiu, Weihsueh A. and Rusyn, Ivan}, year={2024}, month={Sep} } @article{rudra_zhou_nobel_wright_2024, title={Control of false discoveries in grouped hypothesis testing for eQTL data}, volume={25}, ISSN={["1471-2105"]}, DOI={10.1186/s12859-024-05736-3}, abstractNote={Abstract Background Expression quantitative trait locus (eQTL) analysis aims to detect the genetic variants that influence the expression of one or more genes. Gene-level eQTL testing forms a natural grouped-hypothesis testing strategy with clear biological importance. Methods to control family-wise error rate or false discovery rate for group testing have been proposed earlier, but may not be powerful or easily apply to eQTL data, for which certain structured alternatives may be defensible and may enable the researcher to avoid overly conservative approaches. Results In an empirical Bayesian setting, we propose a new method to control the false discovery rate (FDR) for grouped hypotheses. Here, each gene forms a group, with SNPs annotated to the gene corresponding to individual hypotheses. The heterogeneity of effect sizes in different groups is considered by the introduction of a random effects component. Our method, entitled Random Effects model and testing procedure for Group-level FDR control (REG-FDR), assumes a model for alternative hypotheses for the eQTL data and controls the FDR by adaptive thresholding. 
As a convenient alternate approach, we also propose Z-REG-FDR, an approximate version of REG-FDR, that uses only Z-statistics of association between genotype and expression for each gene-SNP pair. The performance of Z-REG-FDR is evaluated using both simulated and real data. Simulations demonstrate that Z-REG-FDR performs similarly to REG-FDR, but with much improved computational speed. Conclusion Our results demonstrate that the Z-REG-FDR method performs favorably compared to other methods in terms of statistical power and control of FDR. It can be of great practical use for grouped hypothesis testing for eQTL analysis or similar problems in statistical genomics due to its fast computation and ability to be fit using only summary data.}, number={1}, journal={BMC BIOINFORMATICS}, author={Rudra, Pratyaydipta and Zhou, Yi-Hui and Nobel, Andrew and Wright, Fred A.}, year={2024}, month={Apr} } @article{stonebraker_pace_gallins_dang_aksit_faino_gordon_macparland_bamshad_gibson_et al._2024, title={Genetic variation in severe cystic fibrosis liver disease is associated with novel mechanisms for disease pathogenesis}, volume={3}, ISSN={["1527-3350"]}, DOI={10.1097/HEP.0000000000000863}, abstractNote={ Background and Aims: It is not known why severe cystic fibrosis (CF) liver disease (CFLD) with portal hypertension occurs in only ~7% of people with CF (pwCF). We aimed to identify genetic modifiers for severe CFLD to improve understanding of disease mechanisms. Approach and Results: Whole genome sequencing was available in 4,082 pwCF with pancreatic insufficiency (n=516 with severe CFLD; n=3,566 without CFLD). 
We tested ~15.9 million SNPs for association with severe CFLD versus no-CFLD, using pre-modulator clinical phenotypes including: 1) genetic variant (SERPINA1; Z-allele) previously associated with severe CFLD; 2) candidate SNPs (n=205) associated with non-CF liver diseases; 3) genome-wide association study (GWAS) of common/rare SNPs; 4) transcriptome-wide association (TWAS); and 5) gene-level and pathway analyses. The Z-allele was significantly associated with severe CFLD (p=1.1×10-4). No significant candidate SNPs were identified. GWAS identified genome-wide significant SNPs in 2 loci and 2 suggestive loci. These 4 loci contained genes [significant, PKD1 (p=8.05×10-10) and FNBP1 (p=4.74×10-9); suggestive, DUSP6 (p=1.51×10-7) and ANKUB1 (p=4.69×10-7)] relevant to severe CFLD pathophysiology. TWAS identified 3 genes [CXCR1 (p=1.01×10-6), AAMP (p=1.07×10-6), and TRBV24 (p=1.23×10-5)] involved in hepatic inflammation and innate immunity. Gene-ranked analyses identified pathways enriched in genes linked to multiple liver pathologies. Conclusion: These results identify loci/genes associated with severe CFLD that point to disease mechanisms involving hepatic fibrosis, inflammation and innate immune function, vascular pathology, intracellular signaling, actin cytoskeleton and tight junction integrity, and mechanisms of hepatic steatosis and insulin resistance. These discoveries will facilitate mechanistic studies and the development of therapeutics for severe CFLD. 
}, journal={HEPATOLOGY}, author={Stonebraker, Jaclyn and Pace, Rhonda and Gallins, Paul and Dang, Hong and Aksit, Melis and Faino, Anna and Gordon, William and Macparland, Sonya and Bamshad, Michael and Gibson, Ronald and et al.}, year={2024}, month={Mar} } @article{ford_lin_tsai_zhou_wright_sedykh_shah_chiu_rusyn_2024, title={Hazard and risk characterization of 56 structurally diverse PFAS using a targeted battery of broad coverage assays using six human cell types}, volume={503}, ISSN={["1879-3185"]}, DOI={10.1016/j.tox.2024.153763}, abstractNote={Per- and poly-fluoroalkyl substances (PFAS) are extensively used in commerce leading to their prevalence in the environment. Due to their chemical stability, PFAS are considered to be persistent and bioaccumulative; they are frequently detected in both the environment and humans. Because of this, PFAS as a class (composed of hundreds to thousands of chemicals) are contaminants of very high concern. Little information is available for the vast majority of PFAS, and regulatory agencies lack safety data to determine whether exposure limits or restrictions are needed. Cell-based assays are a pragmatic approach to inform decision-makers on potential health hazards; therefore, we hypothesized that a targeted battery of human in vitro assays can be used to determine whether there are structure-bioactivity relationships for PFAS, and to characterize potential risks by comparing bioactivity (points of departure) to exposure estimates. We tested 56 PFAS from 8 structure-based subclasses in concentration response (0.1–100 μM) using six human cell types selected from target organs with suggested adverse effects of PFAS – human induced pluripotent stem cell (iPSC)-derived hepatocytes, neurons, and cardiomyocytes, primary human hepatocytes, endothelial and HepG2 cells. 
While many compounds were without effect; certain PFAS demonstrated cell-specific activity highlighting the necessity of using a compendium of in vitro models to identify potential hazards. No class-specific groupings were evident except for some chain length- and structure-related trends. In addition, margins of exposure (MOE) were derived using empirical and predicted exposure data. Conservative MOE calculations showed that most tested PFAS had a MOE in the 1–100 range; ∼20% of PFAS had MOE<1, providing tiered priorities for further studies. Overall, we show that a compendium of human cell-based models can be used to derive bioactivity estimates for a range of PFAS, enabling comparisons with human biomonitoring data. Furthermore, we emphasize that establishing structure-bioactivity relationships may be challenging for the tested PFAS.}, journal={TOXICOLOGY}, author={Ford, Lucie C. and Lin, Hsing-Chieh and Tsai, Han-Hsuan D. and Zhou, Yi-Hui and Wright, Fred A. and Sedykh, Alexander and Shah, Ruchir R. and Chiu, Weihsueh A. and Rusyn, Ivan}, year={2024}, month={Mar} } @article{broadaway_brotman_rosen_currin_alkhawaja_etheridge_wright_gallins_jima_zhou_et al._2024, title={Liver eQTL meta-analysis illuminates potential molecular mechanisms of cardiometabolic traits}, volume={111}, ISSN={["1537-6605"]}, DOI={10.1016/j.ajhg.2024.07.017}, abstractNote={Understanding the molecular mechanisms of complex traits is essential for developing targeted interventions. We analyzed liver expression quantitative-trait locus (eQTL) meta-analysis data on 1,183 participants to identify conditionally distinct signals. We found 9,013 eQTL signals for 6,564 genes; 23% of eGenes had two signals, and 6% had three or more signals. We then integrated the eQTL results with data from 29 cardiometabolic genome-wide association study (GWAS) traits and identified 1,582 GWAS-eQTL colocalizations for 747 eGenes. Non-primary eQTL signals accounted for 17% of all colocalizations. 
Isolating signals by conditional analysis prior to coloc resulted in 37% more colocalizations than using marginal eQTL and GWAS data, highlighting the importance of signal isolation. Isolating signals also led to stronger evidence of colocalization: among 343 eQTL-GWAS signal pairs in multi-signal regions, analyses that isolated the signals of interest resulted in higher posterior probability of colocalization for 41% of tests. Leveraging allelic heterogeneity, we predicted causal effects of gene expression on liver traits for four genes. To predict functional variants and regulatory elements, we colocalized eQTL with liver chromatin accessibility QTL (caQTL) and found 391 colocalizations, including 73 with non-primary eQTL signals and 60 eQTL signals that colocalized with both a caQTL and a GWAS signal. Finally, we used publicly available massively parallel reporter assays in HepG2 to highlight 14 eQTL signals that include at least one expression-modulating variant. This multi-faceted approach to unraveling the genetic underpinnings of liver-related traits could lead to therapeutic development.}, number={9}, journal={AMERICAN JOURNAL OF HUMAN GENETICS}, author={Broadaway, K. Alaine and Brotman, Sarah M. and Rosen, Jonathan D. and Currin, Kevin W. and Alkhawaja, Abdalla A. and Etheridge, Amy S. and Wright, Fred and Gallins, Paul and Jima, Dereje and Zhou, Yi-hui and et al.}, year={2024}, month={Sep} } @article{sun_zhou_2024, title={Predicting Microbiome Growth Dynamics under Environmental Perturbations}, url={https://doi.org/10.3390/applmicrobiol4020064}, DOI={10.3390/applmicrobiol4020064}, abstractNote={MicroGrowthPredictor is a model that leverages Long Short-Term Memory (LSTM) networks to predict dynamic changes in microbiome growth in response to varying environmental perturbations. 
In this article, we present the innovative capabilities of MicroGrowthPredictor, which include the integration of LSTM modeling with a novel confidence interval estimation technique. The LSTM network captures the complex temporal dynamics of microbiome systems, while the novel confidence intervals provide a robust measure of prediction uncertainty. We include two examples—one illustrating the human gut microbiota composition and diversity due to recurrent antibiotic treatment and the other demonstrating the application of MicroGrowthPredictor on an artificial gut dataset. The results demonstrate the enhanced accuracy and reliability of the LSTM-based predictions facilitated by MicroGrowthPredictor. The inclusion of specific metrics, such as the mean square error, validates the model’s predictive performance. Our model holds immense potential for applications in environmental sciences, healthcare, and biotechnology, fostering advancements in microbiome research and analysis. Moreover, it is noteworthy that MicroGrowthPredictor is applicable to real data with small sample sizes and temporal observations under environmental perturbations, thus ensuring its practical utility across various domains.}, journal={Applied Microbiology}, author={Sun, George and Zhou, Yi-Hui}, year={2024}, month={Jun} } @misc{sun_zhou_2023, title={AI in healthcare: navigating opportunities and challenges in digital communication}, volume={5}, ISSN={["2673-253X"]}, url={http://dx.doi.org/10.3389/fdgth.2023.1291132}, DOI={10.3389/fdgth.2023.1291132}, abstractNote={The landscape of healthcare communication is undergoing a profound transformation in the digital age, and at the heart of this evolution are AI-powered chatbots. This mini-review delves into the role of AI chatbots in digital health, providing a detailed exploration of their applications, benefits, challenges, and future prospects. 
Our focus is on their versatile applications within healthcare, encompassing health information dissemination, appointment scheduling, medication management, remote patient monitoring, and emotional support services. The review underscores the compelling advantages of AI chatbots. However, it also addresses the significant challenges posed by the integration of AI tools into healthcare communication.}, journal={FRONTIERS IN DIGITAL HEALTH}, author={Sun, George and Zhou, Yi-Hui}, year={2023}, month={Dec} } @article{rosenfeld_faino_qu_onchiri_blue_collaco_gordon_szczesniak_zhou_bamshad_et al._2023, title={Association of Pseudomonas aeruginosa infection stage with lung function trajectory in children with cystic fibrosis}, volume={22}, ISSN={["1873-5010"]}, DOI={10.1016/j.jcf.2023.05.004}, abstractNote={Background Pseudomonas aeruginosa (Pa) infection in cystic fibrosis (CF) is characterized in stages: never (prior to first positive culture) to incident (first positive culture) to chronic. The association of Pa infection stage with lung function trajectory is poorly understood and the impact of age on this association has not been examined. We hypothesized that FEV1 decline would be slowest prior to Pa infection, intermediate after incident infection and greatest after chronic Pa infection. Methods Participants in a large US prospective cohort study diagnosed with CF prior to age 3 contributed data through the U.S. CF Patient Registry. Cubic spline linear mixed effects models were used to evaluate the longitudinal association of Pa stage (never, incident, chronic using 4 different definitions) with FEV1 adjusted for relevant covariates. Models contained interaction terms between age and Pa stage. Results 1,264 subjects born 1992–2006 provided a median 9.5 (IQR 0.25 to 15.75) years of follow up through 2017. 89% developed incident Pa; 39–58% developed chronic Pa depending on the definition. 
Compared to never Pa, incident Pa infection was associated with greater annual FEV1 decline and chronic Pa infection with the greatest FEV1 decline. The most rapid FEV1 decline and strongest association with Pa infection stage was seen in early adolescence (ages 12–15). Conclusions Annual FEV1 decline worsens significantly with each Pa infection stage in children with CF. Our findings suggest that measures to prevent chronic infection, particularly during the high-risk period of early adolescence, could mitigate FEV1 decline and improve survival.}, number={5}, journal={JOURNAL OF CYSTIC FIBROSIS}, author={Rosenfeld, Margaret and Faino, Anna V and Qu, Pingping and Onchiri, Frankline M. and Blue, Elizabeth E. and Collaco, Joseph M. and Gordon, William W. and Szczesniak, Rhonda and Zhou, Yi-Hui and Bamshad, Michael J. and et al.}, year={2023}, month={Sep}, pages={857–863} } @article{zhou_gallins_pace_dang_aksit_blue_buckingham_collaco_faino_gordon_et al._2023, title={Genetic Modifiers of Cystic Fibrosis Lung Disease Severity}, volume={207}, ISSN={["1535-4970"]}, url={https://doi.org/10.1164/rccm.202209-1653OC}, DOI={10.1164/rccm.202209-1653OC}, abstractNote={RATIONALE Lung disease is the major cause of morbidity and mortality in persons with cystic fibrosis (pwCF). Variability in CF lung disease has substantial non-CFTR genetic influence. Identification of genetic modifiers has prognostic and therapeutic importance. OBJECTIVES Identify genetic modifier loci and genes/pathways associated with pulmonary disease severity. METHODS Whole genome sequencing (WGS) data on 4,248 unique pwCF with pancreatic insufficiency (PI) and lung function measures were combined with imputed genotypes from an additional 3,592 PI patients from the US, Canada, and France. 
This report describes association of ~15.9 million single nucleotide polymorphisms (SNPs), using the quantitative Kulich Normal Residual Mortality Adjusted (KNoRMA) lung disease phenotype in 7,840 pwCF using pre-modulator lung function data. MEASUREMENTS AND MAIN RESULTS Testing included common and rare SNPs, transcriptome-wide association, gene level, and pathway analyses. Pathway analyses identified novel associations with genes that have key roles in organ development, and we hypothesize these genes may relate to dysanapsis and/or variability in lung repair. Results confirmed and extended previous GWAS findings. These WGS data provide finely mapped genetic information to support mechanistic studies. No novel primary associations with common single variants or with rare variants were found. Multi-locus effects at chr5p13 (SLC9A3/CEP72) and chr11p13 (EHF/APIP) were identified. Variant effect size estimates at associated loci were consistently ordered across the cohorts, indicating possible age or birth cohort effects. CONCLUSIONS This pre-modulator genomic, transcriptomic, and pathway association study of 7,840 pwCF will facilitate mechanistic and post-modulator genetic studies and, development of novel therapeutics for CF lung disease.}, number={10}, journal={AMERICAN JOURNAL OF RESPIRATORY AND CRITICAL CARE MEDICINE}, author={Zhou, Yi-Hui and Gallins, Paul J. and Pace, Rhonda G. and Dang, Hong and Aksit, Melis A. and Blue, Elizabeth E. and Buckingham, Kati J. and Collaco, Joseph M. and Faino, Anna V. and Gordon, William W. 
and et al.}, year={2023}, month={May}, pages={1324–1333} } @article{cordova_klaren_ford_grimm_baker_zhou_wright_rusyn_2023, title={Integrative Chemical-Biological Grouping of Complex High Production Volume Substances from Lower Olefin Manufacturing Streams}, volume={11}, ISSN={["2305-6304"]}, url={https://doi.org/10.3390/toxics11070586}, DOI={10.3390/toxics11070586}, abstractNote={Human cell-based test methods can be used to evaluate potential hazards of mixtures and products of petroleum refining (“unknown or variable composition, complex reaction products, or biological materials” substances, UVCBs). Analyses of bioactivity and detailed chemical characterization of petroleum UVCBs were used separately for grouping these substances; a combination of the approaches has not been undertaken. Therefore, we used a case example of representative high production volume categories of petroleum UVCBs, 25 lower olefin substances from low benzene naphtha and resin oils categories, to determine whether existing manufacturing-based category grouping can be supported. We collected two types of data: nontarget ion mobility spectrometry-mass spectrometry of both neat substances and their organic extracts and in vitro bioactivity of the organic extracts in five human cell types: umbilical vein endothelial cells and induced pluripotent stem cell-derived hepatocytes, endothelial cells, neurons, and cardiomyocytes. We found that while similarity in composition and bioactivity can be observed for some substances, existing categories are largely heterogeneous. Strong relationships between composition and bioactivity were observed, and individual constituents that determine these associations were identified. Overall, this study showed a promising approach that combines chemical composition and bioactivity data to better characterize the variability within manufacturing categories of petroleum UVCBs.}, number={7}, journal={TOXICS}, author={Cordova, Alexandra C. and Klaren, William D. 
and Ford, Lucie C. and Grimm, Fabian A. and Baker, Erin S. and Zhou, Yi-Hui and Wright, Fred A. and Rusyn, Ivan}, year={2023}, month={Jul} } @article{song_zhou_2023, title={Leveraging Scheme for Cross-Study Microbiome Machine Learning Prediction and Feature Evaluations}, volume={10}, ISSN={["2306-5354"]}, url={https://doi.org/10.3390/bioengineering10020231}, DOI={10.3390/bioengineering10020231}, abstractNote={The microbiota has proved to be one of the critical factors for many diseases, and researchers have been using microbiome data for disease prediction. However, models trained on one independent microbiome study may not be easily applicable to other independent studies due to the high level of variability in microbiome data. In this study, we developed a method for improving the generalizability and interpretability of machine learning models for predicting three different diseases (colorectal cancer, Crohn’s disease, and immunotherapy response) using nine independent microbiome datasets. Our method involves combining a smaller dataset with a larger dataset, and we found that using at least 25% of the target samples in the source data resulted in improved model performance. We determined random forest as our top model and employed feature selection to identify common and important taxa for disease prediction across the different studies. 
Our results suggest that this leveraging scheme is a promising approach for improving the accuracy and interpretability of machine learning models for predicting diseases based on microbiome data.}, number={2}, journal={BIOENGINEERING-BASEL}, author={Song, Kuncheng and Zhou, Yi-Hui}, year={2023}, month={Feb} } @article{ting_wright_zhou_2023, title={Simultaneous modeling of multivariate heterogeneous responses and heteroskedasticity via a two-stage composite likelihood}, ISSN={["1521-4036"]}, url={https://doi.org/10.1002/bimj.202200029}, DOI={10.1002/bimj.202200029}, abstractNote={AbstractMultivariate heterogeneous responses and heteroskedasticity have attracted increasing attention in recent years. In genome‐wide association studies, effective simultaneous modeling of multiple phenotypes would improve statistical power and interpretability. However, a flexible common modeling system for heterogeneous data types can pose computational difficulties. Here we build upon a previous method for multivariate probit estimation using a two‐stage composite likelihood that exhibits favorable computational time while retaining attractive parameter estimation properties. We extend this approach to incorporate multivariate responses of heterogeneous data types (binary and continuous), and possible heteroskedasticity. Although the approach has wide applications, it would be particularly useful for genomics, precision medicine, or individual biomedical prediction. Using a genomics example, we explore statistical power and confirm that the approach performs well for hypothesis testing and coverage percentages under a wide variety of settings. The approach has the potential to better leverage genomics data and provide interpretable inference for pleiotropy, in which a locus is associated with multiple traits.}, journal={BIOMETRICAL JOURNAL}, author={Ting, Bryan W. W. and Wright, Fred A. 
and Zhou, Yi-Hui}, year={2023}, month={May} } @article{wang_zhou_2022, title={A Double Penalty Model for Ensemble Learning}, volume={10}, ISSN={["2227-7390"]}, url={https://doi.org/10.3390/math10234532}, DOI={10.3390/math10234532}, abstractNote={Modern statistical learning techniques often include learning ensembles, for which the combination of multiple separate prediction procedures (ensemble components) can improve prediction accuracy. Although ensemble approaches are widely used, work remains to improve our understanding of the theoretical underpinnings of aspects such as identifiability and relative convergence rates of the ensemble components. By considering ensemble learning for two learning ensemble components as a double penalty model, we provide a framework to better understand the relative convergence and identifiability of the two components. In addition, with appropriate conditions the framework provides convergence guarantees for a form of residual stacking when iterating between the two components as a cyclic coordinate ascent procedure. We conduct numerical experiments on three synthetic simulations and two real world datasets to illustrate the performance of our approach, and justify our theory.}, number={23}, journal={MATHEMATICS}, author={Wang, Wenjia and Zhou, Yi-Hui}, year={2022}, month={Dec} } @article{sun_zhou_2022, title={A Machine Learning Pipeline for Mortality Prediction in the ICU}, volume={2}, url={https://doi.org/10.29337/ijdh.44}, DOI={10.29337/ijdh.44}, abstractNote={Mortality risk prediction for patients admitted into the intensive care unit (ICU) is a crucial and challenging task, so that clinicians are able to respond with timely and appropriate clinical intervention. This becomes more urgent under the background of COVID-19 as a global pandemic. In recent years, electronic health records (EHR) have been widely adopted, and have the potential to greatly improve clinical services and diagnostics. 
However, the large proportion of missing data in EHR poses challenges that may reduce the accuracy of prediction methods. We propose a cohort study that builds a pipeline that extracts ICD-9 codes and laboratory tests from public available electronic ICU databases, and improve the in-hospital mortality prediction accuracy using a combination of neural network missing data imputation approach and decision tree based outcome prediction algorithm. We show the proposed approach achieves a higher area under the ROC curve, ranging from 0.88-0.98, compared with other well-known machine learning methods applied to similar target population. It also offers clinical interpretations through variable selection. Our analysis also shows that mortality prediction for neonates was more challenging than for adults, and that prediction accuracy decreases as patients stayed longer in the ICU.}, number={1}, journal={International Journal of Digital Health}, publisher={IJS Press}, author={Sun, Yang and Zhou, Yi-Hui}, year={2022}, month={May} } @article{ford_jang_chen_zhou_gallins_wright_chiu_rusyn_2022, title={A Population-Based Human In Vitro Approach to Quantify Inter-Individual Variability in Responses to Chemical Mixtures}, volume={10}, ISSN={["2305-6304"]}, url={https://doi.org/10.3390/toxics10080441}, DOI={10.3390/toxics10080441}, abstractNote={Human cell-based population-wide in vitro models have been proposed as a strategy to derive chemical-specific estimates of inter-individual variability; however, the utility of this approach has not yet been tested for cumulative exposures in mixtures. This study aimed to test defined mixtures and their individual components and determine whether adverse effects of the mixtures were likely to be more variable in a population than those of the individual chemicals. The in vitro model comprised 146 human lymphoblastoid cell lines from four diverse subpopulations of European and African descent. 
Cells were exposed, in concentration–response, to 42 chemicals from diverse classes of environmental pollutants; in addition, eight defined mixtures were prepared from these chemicals using several exposure- or hazard-based scenarios. Points of departure for cytotoxicity were derived using Bayesian concentration–response modeling and population variability was quantified in the form of a toxicodynamic variability factor (TDVF). We found that 28 chemicals and all mixtures exhibited concentration–response cytotoxicity, enabling calculation of the TDVF. The median TDVF across test substances, for both individual chemicals or defined mixtures, ranged from a default assumption (101/2) of toxicodynamic variability in human population to >10. The data also provide a proof of principle for single-variant genome-wide association mapping for toxicity of the chemicals and mixtures, although replication would be necessary due to statistical power limitations with the current sample size. This study demonstrates the feasibility of using a set of human lymphoblastoid cell lines as an in vitro model to quantify the extent of inter-individual variability in hazardous properties of both individual chemicals and mixtures. The data show that population variability of the mixtures is unlikely to exceed that of the most variable component, and that similarity in genome-wide associations among components may be used to accrue additional evidence for grouping of constituents in a mixture for cumulative assessments.}, number={8}, journal={TOXICS}, author={Ford, Lucie C. and Jang, Suji and Chen, Zunwei and Zhou, Yi-Hui and Gallins, Paul J. and Wright, Fred A. and Chiu, Weihsueh A. 
and Rusyn, Ivan}, year={2022}, month={Aug} } @article{zhou_gallins_etheridge_jima_scholl_wright_innocenti_2022, title={A resource for integrated genomic analysis of the human liver}, volume={12}, ISSN={["2045-2322"]}, url={https://doi.org/10.1038/s41598-022-18506-z}, DOI={10.1038/s41598-022-18506-z}, abstractNote={AbstractIn this study, we generated whole-transcriptome RNA-Seq from n = 192 genotyped liver samples and used these data with existing data from the GTEx Project (RNA-Seq) and previous liver eQTL (microarray) studies to create an enhanced transcriptomic sequence resource in the human liver. Analyses of genotype-expression associations show pronounced enrichment of associations with genes of drug response. The associations are primarily consistent across the two RNA-Seq datasets, with some modest variation, indicating the importance of obtaining multiple datasets to produce a robust resource. We further used an empirical Bayesian model to compare eQTL patterns in liver and an additional 20 GTEx tissues, finding that MHC genes, and especially class II genes, are enriched for liver-specific eQTL patterns. To illustrate the utility of the resource to augment GWAS analysis with small sample sizes, we developed a novel meta-analysis technique to combine several liver eQTL data sources. We also illustrate its application using a transcriptome-enhanced re-analysis of a study of neutropenia in pancreatic cancer patients. The associations of genotype with liver expression, including splice variation and its genetic associations, are made available in a searchable genome browser.}, number={1}, journal={SCIENTIFIC REPORTS}, author={Zhou, Yi-Hui and Gallins, Paul J. and Etheridge, Amy S. and Jima, Dereje and Scholl, Elizabeth and Wright, Fred A. 
and Innocenti, Federico}, year={2022}, month={Sep} } @article{kingston_stilp_gordon_broome_gogarten_ling_barnard_dugan-perez_ellinor_gabriel_et al._2022, title={Accounting for population structure in genetic studies of cystic fibrosis}, volume={3}, ISSN={["2666-2477"]}, DOI={10.1016/j.xhgg.2022.100117}, abstractNote={CFTR F508del (c.1521_1523delCTT, p.Phe508delPhe) is the most common pathogenic allele underlying cystic fibrosis (CF), and its frequency varies in a geographic cline across Europe. We hypothesized that genetic variation associated with this cline is overrepresented in a large cohort (N > 5,000) of persons with CF who underwent whole-genome sequencing and that this pattern could result in spurious associations between variants correlated with both the F508del genotype and CF-related outcomes. Using principal-component (PC) analyses, we showed that variation in the CFTR region disproportionately contributes to a PC explaining a relatively high proportion of genetic variance. Variation near CFTR was correlated with population structure among persons with CF, and this correlation was driven by a subset of the sample inferred to have European ancestry. We performed genome-wide association studies comparing persons with CF with one versus two copies of the F508del allele; this allowed us to identify genetic variation associated with the F508del allele and to determine that standard PC-adjustment strategies eliminated the significant association signals. Our results suggest that PC adjustment can adequately prevent spurious associations between genetic variants and CF-related traits and are therefore effective tools to control for population structure even when population structure is confounded with disease severity and a common pathogenic variant.}, number={3}, journal={HUMAN GENETICS AND GENOMICS ADVANCES}, author={Kingston, Hanley and Stilp, Adrienne M. and Gordon, William and Broome, Jai and Gogarten, Stephanie M. 
and Ling, Hua and Barnard, John and Dugan-Perez, Shannon and Ellinor, Patrick T. and Gabriel, Stacey and et al.}, year={2022}, month={Jul} } @article{song_zhou_2022, title={C3NA: correlation and consensus-based cross-taxonomy network analysis for compositional microbial data}, volume={23}, ISSN={["1471-2105"]}, DOI={10.1186/s12859-022-05027-9}, abstractNote={Abstract Background Studying the co-occurrence network structure of microbial samples is one of the critical approaches to understanding the perplexing and delicate relationship between the microbe, host, and diseases. It is also critical to develop a tool for investigating co-occurrence networks and differential abundance analyses to reveal the disease-related taxa–taxa relationship. In addition, it is also necessary to tighten the co-occurrence network into smaller modules to increase the ability for functional annotation and interpretability of these taxa–taxa relationships. Also, it is critical to retain the phylogenetic relationship among the taxa to identify differential abundance patterns, which can be used to resolve contradicting functions reported by different studies. Results In this article, we present Correlation and Consensus-based Cross-taxonomy Network Analysis (C3NA), a user-friendly R package for investigating compositional microbial sequencing data to identify and compare co-occurrence patterns across different taxonomic levels. C3NA contains two interactive graphic user interfaces (Shiny applications), one of them dedicated to the comparison between two diagnoses, e.g., disease versus control. We used C3NA to analyze two well-studied diseases, colorectal cancer, and Crohn’s disease. We discovered clusters of study and disease-dependent taxa that overlap with known functional taxa studied by other discovery studies and differential abundance analyses. 
Conclusion C3NA offers a new microbial data analyses pipeline for refined and enriched taxa–taxa co-occurrence network analyses, and the usability was further expanded via the built-in Shiny applications for interactive investigation.}, number={1}, journal={BMC BIOINFORMATICS}, author={Song, Kuncheng and Zhou, Yi-Hui}, year={2022}, month={Nov} } @article{rosenfeld_faino_onchiri_aksit_blackman_blue_collaco_gordon_pace_raraigh_et al._2022, title={Comparing encounter-based and annualized chronic pseudomonas infection definitions in cystic fibrosis}, volume={21}, ISSN={["1873-5010"]}, DOI={10.1016/j.jcf.2021.07.020}, abstractNote={Chronic Pseudomonas aeruginosa (Pa) infection is associated with increased morbidity and mortality in people with cystic fibrosis (CF). There is no gold standard definition of chronic Pa infection in CF. We compared chronic Pa definitions using encounter-based versus annualized data in the Early Pseudomonas Infection Control (EPIC) Observational study cohort, and subsequently compared annualized chronic Pa definitions across a range of U.S. cohorts spanning decades of CF care. We found that an annualized chronic Pa definition requiring at least 1 Pa+ culture in 3 of 4 consecutive years ("Green 3/4") resulted in chronic Pa metrics similar to established encounter-based modified Leeds criteria definitions, including a similar age at and proportion who fulfilled chronic Pa criteria, and a similar proportion with sustained Pa infection after meeting the chronic Pa definition. The Green 3/4 chronic Pa definition will be valuable for longitudinal analyses in cohorts with limited culture frequency.}, number={1}, journal={JOURNAL OF CYSTIC FIBROSIS}, author={Rosenfeld, Margaret and Faino, Anna V. and Onchiri, Frankline and Aksit, Melis A. and Blackman, Scott M. and Blue, Elizabeth E. and Collaco, Joseph M. and Gordon, William W. and Pace, Rhonda G. and Raraigh, Karen S. 
and et al.}, year={2022}, month={Jan}, pages={40–44} } @article{raraigh_aksit_hetrick_pace_ling_o'neal_blue_zhou_bamshad_blackman_et al._2022, title={Complete CFTR gene sequencing in 5,058 individuals with cystic fibrosis informs variant-specific treatment}, volume={21}, ISSN={["1873-5010"]}, DOI={10.1016/j.jcf.2021.10.011}, abstractNote={Background Cystic fibrosis (CF) is a recessive condition caused by variants in each CF transmembrane conductance regulator (CFTR) allele. Clinically affected individuals without two identified causal variants typically have no further interrogation of CFTR beyond examination of coding regions, but the development of variant-specific CFTR-targeted treatments necessitates complete understanding of CFTR genotype. Methods Whole genome sequences were analyzed on 5,058 individuals with CF. We focused on the full CFTR gene sequence and identified disease-causing variants in three phases: screening for known and structural variants; discovery of novel loss-of-function variants; and investigation of remaining variants. Results All variants identified in the first two phases and coding region variants found in the third phase were interpreted according to CFTR2 or ACMG criteria (n = 371; 16 [4.3%] previously unreported). Full gene sequencing enabled delineation of 18 structural variants (large insertions or deletions), of which two were novel. Additional CFTR variants of uncertain effect were found in 76 F508del homozygotes and in 21 individuals with other combinations of CF-causing variants. Both causative variants were identified in 98.1% (n = 4,960) of subjects, an increase of 2.3 percentage points from the 95.8% (n = 4,847) who had a registry- or chart-reported disease-causing CFTR genotype. Of the remaining 98 individuals, 78 carried one variant that has been associated with CF (CF-causing [n = 70] or resulting in varying clinical consequences [n = 8]). 
Conclusions Complete CFTR gene sequencing in 5,058 individuals with CF identified at least one DNA variant in 99.6% of the cohort that is targetable by current molecular or emerging gene-based therapeutic technologies.}, number={1}, journal={JOURNAL OF CYSTIC FIBROSIS}, author={Raraigh, Karen S. and Aksit, Melis A. and Hetrick, Kurt and Pace, Rhonda G. and Ling, Hua and O'Neal, Wanda and Blue, Elizabeth and Zhou, Yi-Hui and Bamshad, Michael J. and Blackman, Scott M. and et al.}, year={2022}, month={Jan}, pages={463–470} } @article{ting_wright_zhou_2022, title={Fast Multivariate Probit Estimation via a Two-Stage Composite Likelihood}, volume={2}, ISSN={["1867-1772"]}, url={https://doi.org/10.1007/s12561-022-09338-6}, DOI={10.1007/s12561-022-09338-6}, abstractNote={Abstract The multivariate probit is popular for modeling correlated binary data, with an attractive balance of flexibility and simplicity. However, considerable challenges remain in computation and in devising a clear statistical framework. Interest in the multivariate probit has increased in recent years. Current applications include genomics and precision medicine, where simultaneous modeling of multiple traits may be of interest, and computational efficiency is an important consideration. We propose a fast method for multivariate probit estimation via a two-stage composite likelihood. We explore computational and statistical efficiency, and note that the approach sets the stage for extensions beyond the purely binary setting.}, journal={STATISTICS IN BIOSCIENCES}, author={Ting, Bryan and Wright, Fred and Zhou, Yi-Hui}, year={2022}, month={Feb} } @article{zhou_sun_2022, title={Improve the Colorectal Cancer Diagnosis Using Gut Microbiome Data}, volume={9}, ISSN={["2296-889X"]}, DOI={10.3389/fmolb.2022.921945}, abstractNote={In the United States, colorectal cancer is the second largest cause of cancer death, and accurate early detection and identification of high-risk patients is a high priority. 
Although fecal screening tests are available, the close relationship between colorectal cancer and the gut microbiome has generated considerable interest. We describe a machine learning method for gut microbiome data to assist in diagnosing colorectal cancer. Our methodology integrates feature engineering, mediation analysis, statistical modeling, and network analysis into a novel unified pipeline. Simulation results illustrate the value of the method in comparison to existing methods. For predicting colorectal cancer in two real datasets, this pipeline showed an 8.7% higher prediction accuracy and 13% higher area under the receiver operator characteristic curve than other published work. Additionally, the approach highlights important colorectal cancer-related taxa for prioritization, such as high levels of Bacteroides fragilis, which can help elucidate disease pathology. Our algorithms and approach can be widely applied for colorectal cancer prediction using either 16S rRNA or shotgun metagenomics data.}, journal={FRONTIERS IN MOLECULAR BIOSCIENCES}, author={Zhou, Yi-Hui and Sun, George}, year={2022}, month={Aug} } @article{sun_liu_rosen_huang_pace_dang_gallins_blue_ling_corvol_et al._2022, title={Leveraging TOPMed imputation server and constructing a cohort-specific imputation reference panel to enhance genotype imputation among cystic fibrosis patients}, volume={3}, ISSN={["2666-2477"]}, DOI={10.1016/j.xhgg.2022.100090}, abstractNote={Cystic fibrosis (CF) is a severe genetic disorder that can cause multiple comorbidities affecting the lungs, the pancreas, the luminal digestive system and beyond. In our previous genome-wide association studies (GWAS), we genotyped approximately 8,000 CF samples using a mixture of different genotyping platforms. 
More recently, the Cystic Fibrosis Genome Project (CFGP) performed deep (approximately 30×) whole genome sequencing (WGS) of 5,095 samples to better understand the genetic mechanisms underlying clinical heterogeneity among patients with CF. For mixtures of GWAS array and WGS data, genotype imputation has proven effective in increasing effective sample size. Therefore, we first performed imputation for the approximately 8,000 CF samples with GWAS array genotype using the Trans-Omics for Precision Medicine (TOPMed) freeze 8 reference panel. Our results demonstrate that TOPMed can provide high-quality imputation for patients with CF, boosting genomic coverage from approximately 0.3–4.2 million genotyped markers to approximately 11–43 million well-imputed markers, and significantly improving polygenic risk score (PRS) prediction accuracy. Furthermore, we built a CF-specific CFGP reference panel based on WGS data of patients with CF. We demonstrate that despite having approximately 3% the sample size of TOPMed, our CFGP reference panel can still outperform TOPMed when imputing some CF disease-causing variants, likely owing to allele and haplotype differences between patients with CF and general populations. We anticipate our imputed data for 4,656 samples without WGS data will benefit our subsequent genetic association studies, and the CFGP reference panel built from CF WGS samples will benefit other investigators studying CF.}, number={2}, journal={HUMAN GENETICS AND GENOMICS ADVANCES}, author={Sun, Quan and Liu, Weifang and Rosen, Jonathan D. and Huang, Le and Pace, Rhonda G. and Dang, Hong and Gallins, Paul J. and Blue, Elizabeth E. 
and Ling, Hua and Corvol, Harriet and et al.}, year={2022}, month={Apr} } @article{sun_yang_rosen_jiang_chen_liu_wen_raffield_pace_zhou_et al._2022, title={MagicalRsq: Machine-learning-based genotype imputation quality calibration}, volume={109}, ISSN={["1537-6605"]}, DOI={10.1016/j.ajhg.2022.09.009}, abstractNote={Whole-genome sequencing (WGS) is the gold standard for fully characterizing genetic variation but is still prohibitively expensive for large samples. To reduce costs, many studies sequence only a subset of individuals or genomic regions, and genotype imputation is used to infer genotypes for the remaining individuals or regions without sequencing data. However, not all variants can be well imputed, and the current state-of-the-art imputation quality metric, denoted as standard Rsq, is poorly calibrated for lower-frequency variants. Here, we propose MagicalRsq, a machine-learning-based method that integrates variant-level imputation and population genetics statistics, to provide a better calibrated imputation quality metric. Leveraging WGS data from the Cystic Fibrosis Genome Project (CFGP), and whole-exome sequence data from UK BioBank (UKB), we performed comprehensive experiments to evaluate the performance of MagicalRsq compared to standard Rsq for partially sequenced studies. We found that MagicalRsq aligns better with true $R^2$ than standard Rsq in almost every situation evaluated, for both European and African ancestry samples. For example, when applying models trained from 1,992 CFGP sequenced samples to an independent 3,103 samples with no sequencing but TOPMed imputation from array genotypes, MagicalRsq, compared to standard Rsq, achieved net gains of 1.4 million rare, 117k low-frequency, and 18k common variants, where net gains were gained numbers of correctly distinguished variants by MagicalRsq over standard Rsq. 
MagicalRsq can serve as an improved post-imputation quality metric and will benefit downstream analysis by better distinguishing well-imputed variants from those poorly imputed. MagicalRsq is freely available on GitHub.}, number={11}, journal={AMERICAN JOURNAL OF HUMAN GENETICS}, author={Sun, Quan and Yang, Yingxi and Rosen, Jonathan D. and Jiang, Min-Zhi and Chen, Jiawen and Liu, Weifang and Wen, Jia and Raffield, Laura M. and Pace, Rhonda G. and Zhou, Yi-Hui and et al.}, year={2022}, month={Nov}, pages={1986–1997} } @article{aksit_ling_pace_raraigh_onchiri_faino_pagel_pugh_stilp_sun_et al._2022, title={Pleiotropic modifiers of age-related diabetes and neonatal intestinal obstruction in cystic fibrosis}, volume={109}, ISSN={["1537-6605"]}, DOI={10.1016/j.ajhg.2022.09.004}, abstractNote={Individuals with cystic fibrosis (CF) develop complications of the gastrointestinal tract influenced by genetic variants outside of CFTR. Cystic fibrosis-related diabetes (CFRD) is a distinct form of diabetes with a variable age of onset that occurs frequently in individuals with CF, while meconium ileus (MI) is a severe neonatal intestinal obstruction affecting ∼20% of newborns with CF. CFRD and MI are slightly correlated traits with previous evidence of overlap in their genetic architectures. To better understand the genetic commonality between CFRD and MI, we used whole-genome-sequencing data from the CF Genome Project to perform genome-wide association. These analyses revealed variants at 11 loci (6 not previously identified) that associated with MI and at 12 loci (5 not previously identified) that associated with CFRD. Of these, variants at SLC26A9, CEBPB, and PRSS1 associated with both traits; variants at SLC26A9 and CEBPB increased risk for both traits, while variants at PRSS1, the higher-risk alleles for CFRD, conferred lower risk for MI. Furthermore, common and rare variants within the SLC26A9 locus associated with MI only or CFRD only. 
As expected, different loci modify risk of CFRD and MI; however, a subset exhibit pleiotropic effects indicating etiologic and mechanistic overlap between these two otherwise distinct complications of CF.}, number={10}, journal={AMERICAN JOURNAL OF HUMAN GENETICS}, author={Aksit, Melis A. and Ling, Hua and Pace, Rhonda G. and Raraigh, Karen S. and Onchiri, Frankline and Faino, Anna V. and Pagel, Kymberleigh and Pugh, Elizabeth and Stilp, Adrienne M. and Sun, Quan and et al.}, year={2022}, month={Oct}, pages={1894–1908} } @article{wang_zhou_2021, title={Eigenvector-based sparse canonical correlation analysis: Fast computation for estimation of multiple canonical vectors}, volume={185}, ISSN={["0047-259X"]}, url={https://doi.org/10.1016/j.jmva.2021.104781}, DOI={10.1016/j.jmva.2021.104781}, abstractNote={Classical canonical correlation analysis (CCA) requires matrices to be low dimensional, i.e. the number of features cannot exceed the sample size. Recent developments in CCA have mainly focused on the high-dimensional setting, where the number of features in both matrices under analysis greatly exceeds the sample size. These approaches impose penalties in the optimization problems that need to be solved iteratively, and estimate multiple canonical vectors sequentially. In this work, we provide an explicit link between sparse multiple regression and sparse canonical correlation analysis, and an efficient algorithm that can estimate multiple canonical pairs simultaneously rather than sequentially. Furthermore, the algorithm naturally allows parallel computing. These properties make the algorithm much more efficient. We provide theoretical results on the consistency of canonical pairs. The algorithm and theoretical development are based on solving an eigenvector problem, which significantly differentiates our method from existing methods. Simulation results support the improved performance of the proposed approach. 
We apply eigenvector-based CCA to analysis of the GTEx thyroid histology images, analysis of SNPs and RNA-seq gene expression data, and a microbiome study. The real data analysis also shows improved performance compared to traditional sparse CCA.}, journal={JOURNAL OF MULTIVARIATE ANALYSIS}, publisher={Elsevier BV}, author={Wang, Wenjia and Zhou, Yi-Hui}, year={2021}, month={Sep} } @article{zhou_saghapour_2021, title={ImputEHR: A Visualization Tool of Imputation for the Prediction of Biomedical Data}, volume={12}, ISSN={["1664-8021"]}, DOI={10.3389/fgene.2021.691274}, abstractNote={Electronic health records (EHRs) have been widely adopted in recent years, but often include a high proportion of missing data, which can create difficulties in implementing machine learning and other tools of personalized medicine. Completed datasets are preferred for a number of analysis methods, and successful imputation of missing EHR data can improve interpretation and increase our power to predict health outcomes. However, use of the most popular imputation methods mainly require scripting skills, and are implemented using various packages and syntax. Thus, the implementation of a full suite of methods is generally out of reach to all except experienced data scientists. Moreover, imputation is often considered as a separate exercise from exploratory data analysis, but should be considered as part of the data exploration process. We have created a new graphical tool, ImputEHR, that is based on a Python base and allows implementation of a range of simple and sophisticated (e.g., gradient-boosted tree-based and neural network) data imputation approaches. In addition to imputation, the tool enables data exploration for informed decision-making, as well as implementing machine learning prediction tools for response data selected by the user. Although the approach works for any missing data problem, the tool is primarily motivated by problems encountered for EHR and other biomedical data. 
We illustrate the tool using multiple real datasets, providing performance measures of imputation and downstream predictive analysis.}, journal={FRONTIERS IN GENETICS}, author={Zhou, Yi-Hui and Saghapour, Ehsan}, year={2021}, month={Jul} } @article{chen_jang_kaihatu_zhou_wright_chiu_rusyn_2021, title={Potential Human Health Hazard of Post-Hurricane Harvey Sediments in Galveston Bay and Houston Ship Channel: A Case Study of Using In Vitro Bioactivity Data to Inform Risk Management Decisions}, volume={18}, ISSN={["1660-4601"]}, url={https://doi.org/10.3390/ijerph182413378}, DOI={10.3390/ijerph182413378}, abstractNote={Natural and anthropogenic disasters may be associated with redistribution of chemical contaminants in the environment; however, current methods for assessing hazards and risks of complex mixtures are not suitable for disaster response. This study investigated the suitability of in vitro toxicity testing methods as a rapid means of identifying areas of potential human health concern. We used sediment samples (n = 46) from Galveston Bay and the Houston Ship Channel (GB/HSC) areas after hurricane Harvey, a disaster event that led to broad redistribution of chemically-contaminated sediments, including deposition of the sediment on shore due to flooding. Samples were extracted with cyclohexane and dimethyl sulfoxide and screened in a compendium of human primary or induced pluripotent stem cell (iPSC)-derived cell lines from different tissues (hepatocytes, neuronal, cardiomyocytes, and endothelial) to test for concentration-dependent effects on various functional and cytotoxicity phenotypes (n = 34). Bioactivity data were used to map areas of potential concern and the results compared to the data on concentrations of polycyclic aromatic hydrocarbons (PAHs) in the same samples. 
We found that setting remediation goals based on reducing bioactivity is protective of both “known” risks associated with PAHs and “unknown” risks associated with bioactivity, but the converse was not true for remediation based on PAH risks alone. Overall, we found that in vitro bioactivity can be used as a comprehensive indicator of potential hazards and is an example of a new approach method (NAM) to inform risk management decisions on site cleanup.}, number={24}, journal={INTERNATIONAL JOURNAL OF ENVIRONMENTAL RESEARCH AND PUBLIC HEALTH}, author={Chen, Zunwei and Jang, Suji and Kaihatu, James M. and Zhou, Yi-Hui and Wright, Fred A. and Chiu, Weihsueh A. and Rusyn, Ivan}, year={2021}, month={Dec} } @article{luo_chen_blanchette_zhou_wright_baker_chiu_rusyn_2021, title={Relationships between constituents of energy drinks and beating parameters in human induced pluripotent stem cell (iPSC)-Derived cardiomyocytes}, volume={149}, ISSN={["1873-6351"]}, url={https://doi.org/10.1016/j.fct.2021.111979}, DOI={10.1016/j.fct.2021.111979}, abstractNote={Consumption of energy drinks has been associated with adverse cardiovascular effects; however, little is known about the ingredients that may contribute to these effects. We therefore characterized the chemical profiles and in vitro effects of energy drinks and their ingredients on human induced pluripotent stem cell (iPSC)-derived cardiomyocytes, and identified the putative active ingredients using a multivariate prediction model. Energy drinks from 17 widely-available over-the-counter brands were evaluated in this study. The concentrations of six common ingredients (caffeine, taurine, riboflavin, pantothenic acid, adenine, and L-methionine) were quantified by coupling liquid chromatography with a triple quadrupole mass spectrometer for the acquisition of LC-MS/MS spectra. 
In addition, untargeted analyses for each beverage were performed with a platform combining LC, ion mobility spectrometry and mass spectrometry (LC-IMS-MS) measurements. Approximately 300 features were observed across samples in the untargeted studies, and of these ~100 were identified. In vitro effects of energy drinks and some of their ingredients were then tested in iPSC-derived cardiomyocytes. Data on the beat rate (positive and negative chronotropy), ion channel function (QT prolongation), and cytotoxicity were collected in a dilution series. We found that some of the energy drinks elicited adverse effects on the cardiomyocytes with the most common being an increase in the beat rate, while QT prolongation was also observed at the lowest concentrations. Finally, concentration addition modeling using quantitative data from the 6 common ingredients and multivariate prediction modeling was used to determine potential ingredients responsible for the adverse effects on the cardiomyocytes. These analyses suggested theophylline, adenine, and azelate as possibly contributing to the in vitro effects of energy drinks on QT prolongation in cardiomyocytes.}, journal={FOOD AND CHEMICAL TOXICOLOGY}, author={Luo, Yu-Syuan and Chen, Zunwei and Blanchette, Alexander D. and Zhou, Yi-Hui and Wright, Fred A. and Baker, Erin S. and Chiu, Weihsueh A. and Rusyn, Ivan}, year={2021}, month={Mar} } @misc{marvel_house_wheeler_song_zhou_wright_chiu_rusyn_motsinger-reif_reif_2021, title={The COVID-19 Pandemic Vulnerability Index (PVI) Dashboard: Monitoring County-Level Vulnerability Using Visualization, Statistical Modeling, and Machine Learning}, volume={129}, ISSN={["1552-9924"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85099420902&partnerID=MN8TOARS}, DOI={10.1289/EHP8690}, abstractNote={Vol. 129, No. 
1 Research LetterOpen AccessThe COVID-19 Pandemic Vulnerability Index (PVI) Dashboard: Monitoring County-Level Vulnerability Using Visualization, Statistical Modeling, and Machine Learning Skylar W. Marvel, John S. House, Matthew Wheeler, Kuncheng Song, Yi-Hui Zhou, Fred A. Wright, Weihsueh A. Chiu, Ivan Rusyn, Alison Motsinger-Reif, and David M. Reif Skylar W. Marvel Bioinformatics Research Center, Department of Biological Sciences, North Carolina State University (NCSU), Raleigh, North Carolina, USA , John S. House Biostatistics and Computational Biology Branch, National Institute of Environmental Health Sciences, National Institutes of Health, Department of Health and Human Services, Research Triangle Park, North Carolina, USA , Matthew Wheeler Biostatistics and Computational Biology Branch, National Institute of Environmental Health Sciences, National Institutes of Health, Department of Health and Human Services, Research Triangle Park, North Carolina, USA , Kuncheng Song Bioinformatics Research Center, Department of Biological Sciences, North Carolina State University (NCSU), Raleigh, North Carolina, USA , Yi-Hui Zhou Bioinformatics Research Center, Department of Biological Sciences, North Carolina State University (NCSU), Raleigh, North Carolina, USA , Fred A. Wright Bioinformatics Research Center, Department of Biological Sciences, North Carolina State University (NCSU), Raleigh, North Carolina, USA Department of Statistics, NCSU, Raleigh, North Carolina, USA , Weihsueh A. Chiu Veterinary Integrative Biosciences, College of Veterinary Medicine and Biomedical Sciences, Texas A&M University, College Station, Texas, USA , Ivan Rusyn Veterinary Integrative Biosciences, College of Veterinary Medicine and Biomedical Sciences, Texas A&M University, College Station, Texas, USA , Alison Motsinger-Reif Address correspondence to Alison Motsinger-Reif, 111 T.W. Alexander Dr., Rall Building, Research Triangle Park, NC 27709 USA. 
Email: E-mail Address: [email protected], or David M. Reif, Box 7566, 1 Lampe Dr., Raleigh NC 27695 USA. Email: E-mail Address: [email protected] Biostatistics and Computational Biology Branch, National Institute of Environmental Health Sciences, National Institutes of Health, Department of Health and Human Services, Research Triangle Park, North Carolina, USA , and David M. Reif Address correspondence to Alison Motsinger-Reif, 111 T.W. Alexander Dr., Rall Building, Research Triangle Park, NC 27709 USA. Email: E-mail Address: [email protected], or David M. Reif, Box 7566, 1 Lampe Dr., Raleigh NC 27695 USA. Email: E-mail Address: [email protected] Bioinformatics Research Center, Department of Biological Sciences, North Carolina State University (NCSU), Raleigh, North Carolina, USA Published:5 January 2021CID: 017701https://doi.org/10.1289/EHP8690AboutSectionsPDF ToolsDownload CitationsTrack Citations ShareShare onFacebookTwitterLinked InReddit IntroductionExpert groups have coalesced around a roadmap to address the current COVID-19 pandemic centered on social distancing, monitoring case counts and health care capacity, and, eventually, moving to pharmaceutical interventions. However, responsibility for navigating the pandemic response falls largely on state and local officials. To make equitable decisions on allocating resources, caring for vulnerable subpopulations, and implementing local- and state-level interventions, access to current pandemic data and key vulnerabilities at the community level are essential (National Academies of Sciences, Engineering, and Medicine 2020). Although numerous predictive models and interactive monitoring applications have been developed using pandemic-related data sets (Wynants et al. 2020), their capacity to aid in dynamic, community-level decision-making is limited. 
We developed the interactive COVID-19 Pandemic Vulnerability Index (PVI) Dashboard ( https://covid19pvi.niehs.nih.gov/) to address this need by presenting a visual synthesis of dynamic information at the county level to monitor disease trajectories, communicate local vulnerabilities, forecast key outcomes, and guide informed responses (Figure 1).Figure 1. COVID-19 PVI Dashboard. Dashboard screenshot displaying PVI profiles atop a choropleth map layer indicating overall COVID-19 PVI rank. The PVI Scorecard and associated data for Clarendon County, South Carolina, has been selected. The scorecard summarizes the overall PVI score and rank compared with all 3,142 U.S. counties on each indicator slice. The scrollable score distributions at left compare the selected county PVI to the distributions of overall and slice-wise scores across the United States. The panels below the map are populated with county-specific information on observed trends in cases and deaths, cumulative numbers for the county, historical timelines (for cumulative cases, cumulative deaths, PVI, and PVI rank), daily case and death counts for the most recent 14-d period, and a 14-d forecast of predicted cases and deaths. The information displayed for both observed COVID-19 data and PVI layers is scrollable back through March 2020. Documentation of additional features and usage, including advanced options (accessible via the collapsed menu at the upper left), is provided in a Quick Start Guide (linked at the upper right corner). Note: Pop, population; PVI, Pandemic Vulnerability Index.MethodsThe current PVI model integrates multiple data streams into an overall score derived from 12 key indicators—including well-established, general vulnerability factors for public health, plus emerging factors relevant to the pandemic—distributed across four domains: current infection rates, baseline population concentration, current interventions, and health and environmental vulnerabilities. 
The PVI profiles translate numerical results into visual representations, with each vulnerability factor represented as a component slice of a radar chart (Figure 2). The PVI profile for each county is calculated using the Toxicological Prioritization Index (ToxPi) framework for data integration within a geospatial context (Marvel et al. 2018; Bhandari et al. 2020). Data sources in the current model (version 11.2.1) include the Social Vulnerability Index (SVI) of the Centers for Disease Control and Prevention (CDC) for emergency response and hazard mitigation planning (Horney et al. 2017), testing rates from the COVID Tracking Project (Atlantic Monthly Group 2020), social distancing metrics from mobile device data ( https://www.unacast.com/covid19/social-distancing-scoreboard), and dynamic measures of disease spread and case numbers ( https://usafacts.org/issues/coronavirus/). Methodological details concerning the integration of data streams—plus the complete, daily time series of all source data since February 2020 and resultant PVI scores—are maintained on the public Github project page (COVID19PVI 2020). Over this period, the PVI has been strongly associated with key vulnerability-related outcome metrics (by rank-correlation), with updates of its performance assessment posted with model updates alongside data at the Github project page (COVID19PVI 2020).Figure 2. Translation of data into COVID-19 PVI profiles. Information from all 3,142 U.S. counties is translated into PVI slices. The illustration shows how air pollution data (average density of fine particulate matterPM2.5 per county) are compared for two example counties. The county with the higher relative measurement (County Y) has a longer air pollution slice than the county with a lower measurement (County X). This procedure is repeated for all slices, resulting in an integrated, overall PVI profile. 
Note: pop, population; PVI, Pandemic Vulnerability Index.In addition to the PVI itself—which is a summary, human-centric visualization of relative vulnerability drivers—the dashboard is supported by rigorous statistical modeling of the underlying data to enable quantitative analysis and provide short-term, local predictions of cases and deaths [complete methodological details are maintained at the Github project page (COVID19PVI 2020)]. Generalized linear models of cumulative outcome data indicated that, after population size, the most significant predictors were the proportion of Black residents, mean fine particulate matter [particulate matter less than or equal to 2.5 micrometers≤2.5μm in diameter (fine particulate matterPM2.5)], percentage of population with insurance coverage (which was positively associated), and proportion of Hispanic residents. The local predictions of cases and deaths (see the "Predictions" panel in Figure 1) are updated daily using a Bayesian spatiotemporal random-effects model to build forecasts up to 2 weeks out.DiscussionThe PVI Dashboard supports decision-making and dynamic monitoring in several ways. The display can be tailored to add or remove layers of information, filtered by region (e.g., all counties within a state) or clustered by profile shape similarity. The timelines for both PVI models and observed COVID-19 outcomes facilitate tracking the impact of interventions and directing local resource allocations. The "Predictions" panel (Figure 1) connects these historical numbers to local forecasts of cases and deaths. By communicating an integrated concept of vulnerability that considers both dynamic (infection rate and interventions) and static (community population and health care characteristics) drivers, the interactive dashboard can promote buy-in from diverse audiences, which is necessary for effective public health interventions. 
This messaging can assist in addressing known racial disparities in COVID-19 case and death rates (Tan et al. 2020) or populations, and the PVI Dashboard is part of the "Unique Populations" tab of the CDC's COVID-19 Data Tracker ( https://covid.cdc.gov/covid-data-tracker). By filtering the display to highlight vulnerability drivers within an overall score context, the dashboard can inform targeted interventions for specific localities.Unfortunately, the pandemic endures across the United States, with broad disparities based on the local environment (Tan et al. 2020). We present the PVI Dashboard as a dynamic container for contextualizing these disparities. It is a modular tool that will evolve to incorporate new data sources and analytics as they emerge (e.g., concurrent flu infections, school and business reopening statistics, heterogeneous public health practices). This flexibility positions it well as a resource for integrated prioritization of eventual vaccine distribution and monitoring its local impact. The PVI Dashboard can empower local and state officials to take informed action to combat the pandemic by communicating interactive, visual profiles of vulnerability atop an underlying statistical framework that enables the comparison of counties and the evaluation of the PVI's component data.AcknowledgmentsWe thank the information technology and web services staff at the National Institute of Environmental Health Sciences (NIEHS)/National Institutes of Health (NIH) for their help and support, as well as J.K. Cetina and D.J. Reif for their useful technical input and advice. This work was supported by NIEHS/NIH grants (P42 ES027704, P30 ES029067, P42 ES031009, and P30 ES025128) and NIEHS/NIH intramural funds (Z ES103352-01).ReferencesAtlantic Monthly Group.2020. The COVID Tracking Project. https://covidtracking.com/ [accessed 15 November 2020]. Google ScholarBhandari S, Lewis PGT, Craft E, Marvel SW, Reif DM, Chiu WA. 2020. 
HGBEnviroScreen: enabling community action through data integration in the Houston–Galveston–Brazoria region. Int J Environ Res Public Health 17(4):1130, PMID: 32053902, 10.3390/ijerph17041130. Crossref, Medline, Google ScholarCOVID19PVI.2020. COVID19PVI/data. https://github.com/COVID19PVI/data [accessed 15 November 2020]. Google ScholarHorney J, Nguyen M, Salvesen D, Dwyer C, Cooper J, Berke P. 2017. Assessing the quality of rural hazard mitigation plans in the southeastern United States. J Plan Educ Res 37(1):56–65, 10.1177/0739456X16628605. Crossref, Google ScholarMarvel SW, To K, Grimm FA, Wright FA, Rusyn I, Reif DM. 2018. ToxPi Graphical User Interface 2.0: dynamic exploration, visualization, and sharing of integrated data models. BMC Bioinformatics 19(1):80, PMID: 29506467, 10.1186/s12859-018-2089-2. Crossref, Medline, Google ScholarNational Academies of Sciences, Engineering, and Medicine.2020. Framework for Equitable Allocation of COVID-19 Vaccine. Gayle H, Foege W, Brown L, Kahn B, eds. Washington, DC: National Academies Press. Google ScholarTan TQ, Kullar R, Swartz TH, Mathew TA, Piggott DA, Berthaud V. 2020. Location matters: geographic disparities and impact of coronavirus disease 2019. J Infect Dis 222(12):1951–1954, PMID: 32942299, 10.1093/infdis/jiaa583. Crossref, Medline, Google ScholarWynants L, Van Calster B, Collins GS, Riley RD, Heinze G, Schuit E, et al.2020. Prediction models for diagnosis and prognosis of covid-19: systematic review and critical appraisal. BMJ 369:m1328, PMID: 32265220, 10.1136/bmj.m1328. Crossref, Medline, Google ScholarThe authors declare they have no actual or potential competing financial interests.FiguresReferencesRelatedDetails Vol. 129, No. 
1 January 2021Metrics About Article Metrics Publication History Manuscript received20 November 2020Manuscript revised14 December 2020Manuscript accepted21 December 2020Originally published5 January 2021 Financial disclosuresPDF download License information EHP is an open-access journal published with support from the National Institute of Environmental Health Sciences, National Institutes of Health. All content is public domain unless otherwise noted. Note to readers with disabilities EHP strives to ensure that all journal content is accessible to all readers. However, some figures and Supplemental Material published in EHP articles may not conform to 508 standards due to the complexity of the information being presented. If you need assistance accessing journal content, please contact [email protected]. Our staff will work with you to assess and meet your accessibility needs within 3 working days.}, number={1}, journal={ENVIRONMENTAL HEALTH PERSPECTIVES}, author={Marvel, Skylar W. and House, John S. and Wheeler, Matthew and Song, Kuncheng and Zhou, Yi-Hui and Wright, Fred A. and Chiu, Weihsueh A. and Rusyn, Ivan and Motsinger-Reif, Alison and Reif, David M.}, year={2021}, month={Jan} } @article{gallins_saghapour_zhou_2020, title={Exploring the Limits of Combined Image/'omics Analysis for Non-cancer Histological Phenotypes}, volume={11}, ISSN={["1664-8021"]}, DOI={10.3389/fgene.2020.555886}, abstractNote={The last several years have witnessed an explosion of methods and applications for combining image data with 'omics data, and for prediction of clinical phenotypes. Much of this research has focused on cancer histology, for which genetic perturbations are large, and the signal to noise ratio is high. Related research on chronic, complex diseases is limited by tissue sample availability, lower genomic signal strength, and the less extreme and tissue-specific nature of intermediate histological phenotypes. 
Data from the GTEx Consortium provides a unique opportunity to investigate the connections among phenotypic histological variation, imaging data, and 'omics profiling, from multiple tissue-specific phenotypes at the sub-clinical level. Investigating histological designations in multiple tissues, we survey the evidence for genomic association and prediction of histology, and use the results to test the limits of prediction accuracy using machine learning methods applied to the imaging data, genomics data, and their combination. We find that expression data has similar or superior accuracy for pathology prediction as our use of imaging data, despite the fact that pathological determination is made from the images themselves. A variety of machine learning methods have similar performance, while network embedding methods offer at best limited improvements. These observations hold across a range of tissues and predictor types. The results are supportive of the use of genomic measurements for prediction, and in using the same target tissue in which pathological phenotyping has been performed. Although this last finding is sensible, to our knowledge our study is the first to demonstrate this fact empirically. Even while prediction accuracy remains a challenge, the results show clear evidence of pathway and tissue-specific biology.}, journal={FRONTIERS IN GENETICS}, author={Gallins, Paul and Saghapour, Ehsan and Zhou, Yi-Hui}, year={2020}, month=oct } @article{song_wright_zhou_2020, title={Systematic Comparisons for Composition Profiles, Taxonomic Levels, and Machine Learning Methods for Microbiome-Based Disease Prediction}, volume={7}, ISSN={2296-889X}, DOI={10.3389/fmolb.2020.610845}, abstractNote={Microbiome composition profiles generated from 16S rRNA sequencing have been extensively studied for their usefulness in phenotype trait prediction, including for complex diseases such as diabetes and obesity. 
These microbiome compositions have typically been quantified in the form of Operational Taxonomic Unit (OTU) count matrices. However, alternate approaches such as Amplicon Sequence Variants (ASV) have been used, as well as the direct use of k-mer sequence counts. The overall effect of these different types of predictors when used in concert with various machine learning methods has been difficult to assess, due to varied combinations described in the literature. Here we provide an in-depth investigation of more than 1,000 combinations of these three clustering/counting methods, in combination with varied choices for normalization and filtering, grouping at various taxonomic levels, and the use of more than ten commonly used machine learning methods for phenotype prediction. The use of short k-mers, which have computational advantages and conceptual simplicity, is shown to be effective as a source for microbiome-based prediction. Among machine-learning approaches, tree-based methods show consistent, though modest, advantages in prediction accuracy. We describe the various advantages and disadvantages of combinations in analysis approaches, and provide general observations to serve as a useful guide for future trait-prediction explorations using microbiome data.}, journal={FRONTIERS IN MOLECULAR BIOSCIENCES}, author={Song, Kuncheng and Wright, Fred A. 
and Zhou, Yi-Hui}, year={2020}, month=dec } @article{etheridge_gallins_jima_broadaway_ratain_schuetz_schadt_schroder_molony_zhou_et_al._2020, title={A New Liver Expression Quantitative Trait Locus Map From 1,183 Individuals Provides Evidence for Novel Expression Quantitative Trait Loci of Drug Response, Metabolic, and Sex-Biased Phenotypes}, volume={107}, ISSN={1532-6535}, DOI={10.1002/cpt.1751}, abstractNote={Expression quantitative trait locus (eQTL) studies in human liver are crucial for elucidating how genetic variation influences variability in disease risk and therapeutic outcomes and may help guide strategies to obtain maximal efficacy and safety of clinical interventions. Associations between expression microarray and genome‐wide genotype data from four human liver eQTL studies (n = 1,183) were analyzed. More than 2.3 million cis‐eQTLs for 15,668 genes were identified. When eQTLs were filtered against a list of 1,496 drug response genes, 187,829 cis‐eQTLs for 1,191 genes were identified. Additionally, 1,683 sex‐biased cis‐eQTLs were identified, as well as 49 and 73 cis‐eQTLs that colocalized with genome‐wide association study signals for blood metabolite or lipid levels, respectively. Translational relevance of these results is evidenced by linking DPYD eQTLs to differences in safety of chemotherapy, linking the sex‐biased regulation of PCSK9 expression to anti‐lipid therapy, and identifying the G‐protein coupled receptor GPR180 as a novel drug target for hypertriglyceridemia.}, number={6}, journal={CLINICAL PHARMACOLOGY \& THERAPEUTICS}, author={Etheridge, Amy S. and Gallins, Paul J. and Jima, Dereje and Broadaway, K. Alaine and Ratain, Mark J. 
and Schuetz, Erin and Schadt, Eric and Schroder, Adrian and Molony, Cliona and Zhou, Yihui and others}, year={2020}, month=jun, pages={1383--1393} } @article{zhou_gallins_2019, title={A Review and Tutorial of Machine Learning Methods for Microbiome Host Trait Prediction}, volume={10}, ISSN={1664-8021}, DOI={10.3389/fgene.2019.00579}, abstractNote={With the growing importance of microbiome research, there is increasing evidence that host variation in microbial communities is associated with overall host health. Advancement in genetic sequencing methods for microbiomes has coincided with improvements in machine learning, with important implications for disease risk prediction in humans. One aspect specific to microbiome prediction is the use of taxonomy-informed feature selection. In this review for non-experts, we explore the most commonly used machine learning methods, and evaluate their prediction accuracy as applied to microbiome host trait prediction. Methods are described at an introductory level, and R/Python code for the analyses is provided.}, journal={FRONTIERS IN GENETICS}, author={Zhou, Yi-Hui and Gallins, Paul}, year={2019}, month=jun } @article{zhou_2019, title={A note on cyclic shift permutation testing for large eigenvalues}, volume={8}, url={https://doi.org/10.1002/sta4.257}, DOI={10.1002/sta4.257}, abstractNote={Recent publications have described the problem of testing for the “significance” of large sample (empirical) matrix eigenvalues in the presence of modest variation of underlying true eigenvalues. This modest variation often can be ascribed to endemic dependence in one matrix dimension (e.g., rows), whereas the null hypothesis concerns the other dimension (columns). The need for such testing frequently arises in genomics, time‐series analysis, and a variety of other fields. However, the tools available for testing are underdeveloped, with statistical properties that may be sensitive to the true eigenvalues. 
The purpose of this note is to point the reader to this emerging literature and to suggest that the tool of cyclic shift permutation may be well‐suited to the problem.}, number={1}, journal={Stat}, publisher={Wiley}, author={Zhou, Yi-Hui}, year={2019}, month=jan } @article{zhou_gallins_wright_2019, title={Marker-Trait Complete Analysis}, volume={11}, url={https://doi.org/10.1101/836494}, DOI={10.1101/836494}, abstractNote={A recurring problem in genomics involves testing association of one or more traits of interest to multiple genomic features. Feature-trait squared correlations r2 are commonly-used statistics, sensitive to trend associations. It is often of interest to perform testing across collections {r2} over markers and/or traits using both maxima and sums. However, both trait-trait correlations and marker-marker correlations may be strong and must be considered. The primary tools for multiple testing suffer from various shortcomings, including p-value inaccuracies due to asymptotic methods that may not be applicable. Moreover, there is a lack of general tools for fast screening and follow-up of regions of interest. To address these difficulties, we propose the MTCA approach, for Marker-Trait Complete Analysis. MTCA encompasses a large number of existing approaches, and provides accurate p-values over markers and traits for maxima and sums of r2 statistics. MTCA uses the conditional inference implicit in permutation as a motivational framework, but provides an option for fast screening with two novel tools: (i) a multivariate-normal approximation for the max statistic, and (ii) the concept of eigenvalue-conditional moments for the sum statistic. 
We provide examples for gene-based association testing of a continuous phenotype and cis-eQTL analysis, but MTCA can be applied in a much wider variety of settings and platforms.}, publisher={Cold Spring Harbor Laboratory}, author={Zhou, Yi-Hui and Gallins, Paul and Wright, Fred}, year={2019}, month=nov } @comment{Key carries a b suffix because zhou_2019 is already taken by the cyclic shift permutation note.} @article{zhou_2019b, title={Set-based differential covariance testing for genomics}, volume={8}, url={https://doi.org/10.1002/sta4.235}, DOI={10.1002/sta4.235}, abstractNote={The problem of detecting the changes in covariance for a single pair of genomic features has been studied in some detail but may be limited in importance or general applicability. For testing equality of covariance matrices of a set of features, many methods have been limited to the two‐sample problem and involve varying assumptions on the number of features p versus the sample size n. More general covariance regression approaches are appealing but have been insufficiently structured to provide interpretable testing. To address these deficiencies, we propose a simple uniform framework to test association of covariance matrices with an experimental variable, whether discrete or continuous. We describe four different summary statistics, to ensure power and flexibility under various alternatives, including a new “connectivity” statistic that is sensitive to the changes in overall covariance magnitude. For continuous experimental variables, a natural individual “risk score” is associated with several of the statistics. We establish asymptotic results applicable to both continuous and discrete responses, with relatively mild conditions and allowing for situations where p>n. We also show that the proposed statistics are permutationally equivalent to some existing methods in the two‐sample special case. We demonstrate the power and utility of our approaches via simulation and analysis of real data. 
The R package CorDiff is published on R CRAN.}, number={1}, journal={Stat}, publisher={Wiley}, author={Zhou, Yi-Hui}, year={2019}, month=jan } @comment{Merged two duplicate records of this work that shared one citation key.} @article{frayling_beaumont_jones_yaghootkar_tuke_ruth_casanova_west_locke_sharp_et_al._2018, title={A common allele in FGF21 associated with sugar intake is associated with body shape, lower total body-fat percentage, and higher blood pressure}, volume={23}, number={2}, journal={Cell Reports}, publisher={Elsevier}, author={Frayling, Timothy M and Beaumont, Robin N and Jones, Samuel E and Yaghootkar, Hanieh and Tuke, Marcus A and Ruth, Katherine S and Casanova, Francesco and West, Ben and Locke, Jonathan and Sharp, Seth and others}, year={2018}, pages={327--336} } @article{aoshima_shen_shen_yata_zhou_marron_2018, title={A survey of high dimension low sample size asymptotics}, volume={60}, ISSN={1467-842X}, DOI={10.1111/anzs.12212}, abstractNote={Peter Hall's work illuminated many aspects of statistical thought, some of which are very well known including the bootstrap and smoothing. However, he also explored many other lesser known aspects of mathematical statistics. This is a survey of one of those areas, initiated by a seminal paper in 2005, on high dimension low sample size asymptotics. 
An interesting characteristic of that first paper, and of many of the following papers, is that they contain deep and insightful concepts which are frequently surprising and counter‐intuitive, yet have mathematical underpinnings which tend to be direct and not difficult to prove.}, number={1}, journal={AUSTRALIAN \& NEW ZEALAND JOURNAL OF STATISTICS}, author={Aoshima, Makoto and Shen, Dan and Shen, Haipeng and Yata, Kazuyoshi and Zhou, Yi-Hui and Marron, J. S.}, year={2018}, month=mar, pages={4--19} } @article{hu_gallins_zhou_2018, title={A zero-inflated beta-binomial model for microbiome data analysis}, volume={7}, ISSN={2049-1573}, url={https://doi.org/10.1002/sta4.185}, DOI={10.1002/sta4.185}, abstractNote={The Microbiome is increasingly recognized as an important aspect of the health of host species, involved in many biological pathways and processes and potentially useful as health biomarkers. Taking advantage of high‐throughput sequencing technologies, modern bacterial microbiome studies are metagenomic, interrogating thousands of taxa simultaneously. Several data analysis frameworks have been proposed for microbiome sequence read count data and for determining the most significant features. However, there is still room for improvement. We introduce a zero‐inflated beta‐binomial to model the distribution of microbiome count data and to determine association with a continuous or categorical phenotype of interest. The approach can exploit the mean‐variance relationship to improve power and adjust for covariates. The proposed method is a mixture model with two components: (i) a zero model accounting for excess zeros and (ii) a count model to capture the remaining component by beta‐binomial regression, allowing for overdispersion effects. Simulation studies show that our proposed method effectively controls type I error and has higher power than competing methods to detect taxa associated with phenotype. An R package ZIBBSeqDiscovery is available on R CRAN. 
Copyright © 2018 John Wiley & Sons, Ltd.}, number={1}, journal={Stat}, publisher={Wiley}, author={Hu, Tao and Gallins, Paul and Zhou, Yi-Hui}, year={2018} } @comment{Merged two duplicate records of this work that shared one citation key.} @article{polineni_dang_gallins_jones_pace_stonebraker_commander_krenicky_zhou_corvol_et_al._2018, title={Airway mucosal host defense is key to genomic regulation of cystic fibrosis lung disease severity}, volume={197}, number={1}, journal={American Journal of Respiratory and Critical Care Medicine}, publisher={American Thoracic Society}, author={Polineni, Deepika and Dang, Hong and Gallins, Paul J and Jones, Lisa C and Pace, Rhonda G and Stonebraker, Jaclyn R and Commander, Leah A and Krenicky, Jeanne E and Zhou, Yi-Hui and Corvol, Harriet and others}, year={2018}, pages={79--93} } @article{house_grimm_jima_zhou_rusyn_wright_2017, title={A Pipeline for High-Throughput Concentration Response Modeling of Gene Expression for Toxicogenomics}, volume={8}, journal={Frontiers in Genetics}, publisher={Frontiers}, author={House, John S and Grimm, Fabian A and Jima, Dereje D and Zhou, Yi-Hui and Rusyn, Ivan and Wright, Fred A}, year={2017}, pages={168} } @article{zhou_brooks_wang_2017, title={A Two-Stage Hidden Markov Model Design for Biomarker Detection, with Application to Microbiome Research}, volume={10}, ISSN={1867-1764 1867-1772}, url={http://dx.doi.org/10.1007/S12561-017-9187-Y}, DOI={10.1007/S12561-017-9187-Y}, abstractNote={It has been recognized that for appropriately ordered data, hidden Markov 
models (HMM) with local false discovery rate (FDR) control can increase the power to detect significant associations. For many high-throughput technologies, the cost still limits their application. Two-stage designs are attractive, in which a set of interesting features or biomarkers is identified in a first stage, and then followed up in a second stage. However, to our knowledge no two-stage FDR control with HMMs has been developed. In this paper, we study an efficient HMM-FDR based two-stage design, using a simple integrated analysis procedure across the stages. Numeric studies show its excellent performance when compared to available methods. A power analysis method is also proposed. We use examples from microbiome data to illustrate the methods.}, number={1}, journal={Statistics in Biosciences}, publisher={Springer Science and Business Media LLC}, author={Zhou, Yi-Hui and Brooks, Paul and Wang, Xiaoshan}, year={2017}, month=feb, pages={41--58} } @article{rudra_zhou_wright_2017, title={A procedure to detect general association based on concentration of ranks}, volume={6}, number={1}, journal={Stat}, author={Rudra, Pratyaydipta and Zhou, Yihui and Wright, Fred A}, year={2017}, pages={88--101} } @article{brooks_buck_chen_diao_edwards_fettweis_huzurbazar_rakitin_satten_smirnova_et_al._2017, title={Changes in vaginal community state types reflect major shifts in the microbiome}, volume={28}, ISSN={1651-2235}, url={http://dx.doi.org/10.1080/16512235.2017.1303265}, DOI={10.1080/16512235.2017.1303265}, abstractNote={Background: Recent studies of various human microbiome habitats have revealed thousands of bacterial species and the existence of large variation in communities of microorganisms in the same habitats across individual human subjects. Previous efforts to summarize this diversity, notably in the human gut and vagina, have categorized microbiome profiles by clustering them into community state types (CSTs). 
The functional relevance of specific CSTs has not been established. Objective: We investigate whether CSTs can be used to assess dynamics in the microbiome. Design: We conduct a re-analysis of five sequencing-based microbiome surveys derived from vaginal samples with repeated measures. Results: We observe that detection of a CST transition is largely insensitive to choices in methods for normalization or clustering. We find that healthy subjects persist in a CST for two to three weeks or more on average, while those with evidence of dysbiosis tend to change more often. Changes in CST can be gradual or occur over less than one day. Upcoming CST changes and switches to high-risk CSTs can be predicted with high accuracy in certain scenarios. Finally, we observe that presence of Gardnerella vaginalis is a strong predictor of an upcoming CST change. Conclusion: Overall, our results show that the CST concept is useful for studying microbiome dynamics.}, number={1}, journal={Microbial Ecology in Health and Disease}, publisher={Informa UK Limited}, author={Brooks, J. Paul and Buck, Gregory A. and Chen, Guanhua and Diao, Liyang and Edwards, David J. and Fettweis, Jennifer M. and Huzurbazar, Snehalata and Rakitin, Alexander and Satten, Glen A. 
and Smirnova, Ekaterina and others}, year={2017}, month=jan, pages={1303265} } @comment{Removed a second, less complete record of the preceding work that reused its citation key.} @article{zhou_marron_wright_2018, title={Computation of ancestry scores with mixed families and unrelated individuals}, volume={74}, ISSN={1541-0420}, url={https://doi.org/10.1111/biom.12708}, DOI={10.1111/biom.12708}, abstractNote={The issue of robustness to family relationships in computing genotype ancestry scores such as eigenvector projections has received increased attention in genetic association, and is particularly challenging when sets of both unrelated individuals and closely related family members are included. The current standard is to compute loadings (left singular vectors) using unrelated individuals and to compute projected scores for remaining family members. However, projected ancestry scores from this approach suffer from shrinkage toward zero. We consider two main novel strategies: (i) matrix substitution based on decomposition of a target family-orthogonalized covariance matrix, and (ii) using family-averaged data to obtain loadings. We illustrate the performance via simulations, including resampling from 1000 Genomes Project data, and analysis of a cystic fibrosis dataset. The matrix substitution approach has similar performance to the current standard, but is simple and uses only a genotype covariance matrix, while the family-average method shows superior performance. 
Our approaches are accompanied by novel ancillary approaches that provide considerable insight, including individual-specific eigenvalue scree plots.}, number={1}, journal={BIOMETRICS}, publisher={Wiley}, author={Zhou, Yi-Hui and Marron, James S. and Wright, Fred A.}, year={2018}, month=mar, pages={155--164} } @article{tan_li_shanmugam_piskol_kohler_young_liu_zhang_ramaswami_ariyoshi_et_al._2017, title={Dynamic landscape and regulation of RNA editing in mammals}, volume={550}, ISSN={0028-0836 1476-4687}, url={http://dx.doi.org/10.1038/NATURE24041}, DOI={10.1038/NATURE24041}, abstractNote={Using the GTEx data and others, a comprehensive analysis of adenosine-to-inosine RNA editing in mammals is presented; targets of the various ADAR enzymes are identified, as are several potential regulators of editing, such as AIMP2. The GTEx (Genotype-Tissue Expression) Consortium has established a reference catalogue and associated tissue biobank for gene-expression levels across individuals for diverse tissues of the human body, with a broad sampling of normal, non-diseased human tissues from postmortem donors. The consortium now presents the deepest survey of gene expression across multiple tissues and individuals to date, encompassing 7,051 samples from 449 donors across 44 human tissues. Barbara Engelhardt and colleagues characterize the relationship between genetic variation and gene expression, and find that most genes are regulated by genetic variation near to the affected gene. In accompanying GTEx studies, Alexis Battle, Stephen Montgomery and colleagues examine the effect of rare genetic variation on gene expression across human tissues, Daniel MacArthur and colleagues systematically survey the landscape of X chromosome inactivation in human tissues, and Jin Billy Li and colleagues provide a comprehensive cross-species analysis of adenosine-to-inosine RNA editing in mammals. 
In an accompanying News & Views, Michelle Ward and Yoav Gilad put the latest results in context and discuss how these findings are helping to crack the regulatory code of the human genome. Adenosine-to-inosine (A-to-I) RNA editing is a conserved post-transcriptional mechanism mediated by ADAR enzymes that diversifies the transcriptome by altering selected nucleotides in RNA molecules1. Although many editing sites have recently been discovered2,3,4,5,6,7, the extent to which most sites are edited and how the editing is regulated in different biological contexts are not fully understood8,9,10. Here we report dynamic spatiotemporal patterns and new regulators of RNA editing, discovered through an extensive profiling of A-to-I RNA editing in 8,551 human samples (representing 53 body sites from 552 individuals) from the Genotype-Tissue Expression (GTEx) project and in hundreds of other primate and mouse samples. We show that editing levels in non-repetitive coding regions vary more between tissues than editing levels in repetitive regions. Globally, ADAR1 is the primary editor of repetitive sites and ADAR2 is the primary editor of non-repetitive coding sites, whereas the catalytically inactive ADAR3 predominantly acts as an inhibitor of editing. Cross-species analysis of RNA editing in several tissues revealed that species, rather than tissue type, is the primary determinant of editing levels, suggesting stronger cis-directed regulation of RNA editing for most sites, although the small set of conserved coding sites is under stronger trans-regulation. In addition, we curated an extensive set of ADAR1 and ADAR2 targets and showed that many editing sites display distinct tissue-specific regulation by the ADAR enzymes in vivo. Further analysis of the GTEx data revealed several potential regulators of editing, such as AIMP2, which reduces editing in muscles by enhancing the degradation of the ADAR proteins. 
Collectively, our work provides insights into the complex cis- and trans-regulation of A-to-I editing.},
  number    = {7675},
  journal   = {Nature},
  publisher = {Springer Science and Business Media LLC},
  author    = {Tan, Meng How and Li, Qin and Shanmugam, Raghuvaran and Piskol, Robert and Kohler, Jennefer and Young, Amy N. and Liu, Kaiwen Ivy and Zhang, Rui and Ramaswami, Gokul and Ariyoshi, Kentaro and others},
  year      = {2017},
  month     = oct,
  pages     = {249--254}
}

@article{zhou_cichocki_soldatow_scholl_gallins_jima_yoo_chiu_wright_rusyn_2017,
  title  = {Editor's Highlight: Comparative Dose-Response Analysis of Liver and Kidney Transcriptomic Effects of Trichloroethylene and Tetrachloroethylene in {B6C3F1} Mouse},
  volume = {160},
  ISSN   = {1096-0929},
  DOI    = {10.1093/toxsci/kfx165},
  abstractNote = {Trichloroethylene (TCE) and tetrachloroethylene (PCE) are ubiquitous environmental contaminants and occupational health hazards. Recent health assessments of these agents identified several critical data gaps, including lack of comparative analysis of their effects. This study examined liver and kidney effects of TCE and PCE in a dose-response study design. Equimolar doses of TCE (24, 80, 240, and 800 mg/kg) or PCE (30, 100, 300, and 1000 mg/kg) were administered by gavage in aqueous vehicle to male B6C3F1/J mice. Tissues were collected 24 h after exposure. Trichloroacetic acid (TCA), a major oxidative metabolite of both compounds, was measured and RNA sequencing was performed. PCE had a stronger effect on liver and kidney transcriptomes, as well as greater concentrations of TCA. Most dose-responsive pathways were common among chemicals/tissues, with the strongest effect on peroxisomal β-oxidation. Effects on liver and kidney mitochondria-related pathways were notably unique to PCE. 
We performed dose-response modeling of the transcriptomic data and compared the resulting points of departure (PODs) to those for apical endpoints derived from long-term studies with these chemicals in rats, mice, and humans, converting to human equivalent doses using tissue-specific dosimetry models. Tissue-specific acute transcriptional effects of TCE and PCE occurred at human equivalent doses comparable to those for apical effects. These data are relevant for human health assessments of TCE and PCE as they provide data for dose-response analysis of the toxicity mechanisms. Additionally, they provide further evidence that transcriptomic data can be useful surrogates for in vivo PODs, especially when toxicokinetic differences are taken into account.},
  number    = {1},
  journal   = {Toxicological Sciences},
  publisher = {Oxford University Press},
  author    = {Zhou, Yi-Hui and Cichocki, Joseph A. and Soldatow, Valerie Y. and Scholl, Elizabeth H. and Gallins, Paul J. and Jima, Dereje and Yoo, Hong-Sik and Chiu, Weihsueh A. and Wright, Fred A. and Rusyn, Ivan},
  year      = {2017},
  month     = nov,
  pages     = {95--110}
}

% The "b" suffix keeps this key distinct from zhou_marron_wright_2018, which is
% already taken by the ancestry-scores paper by the same authors in the same year.
@article{zhou_marron_wright_2018b,
  title = {Eigenvalue Significance Testing for Genetic Association},
  url   = {https://doi.org/10.1111/biom.12767},
  DOI   = {10.1111/biom.12767},
  abstractNote = {Summary Genotype eigenvectors are widely used as covariates for control of spurious stratification in genetic association. Significance testing for the accompanying eigenvalues has typically been based on a standard Tracy–Widom limiting distribution for the largest eigenvalue, derived under white-noise assumptions. It is known that even modest local correlation among markers inflates the largest eigenvalues, even in the absence of true stratification. In addition, a few sample eigenvalues may be extreme, creating further complications in accurate testing. 
We explore several methods to identify appropriate null eigenvalue thresholds, while remaining sensitive to eigenvalues corresponding to population stratification. We introduce a novel block permutation approach, designed to produce an appropriate null eigenvalue distribution by eliminating long-range genomic correlation while preserving local correlation. We also propose a fast approach based on eigenvalue distribution modeling, using a simple fit criterion and the general Marčenko–Pastur equation under a simple discrete eigenvalue model. Block permutation and the model-based approach work well for pure simulations and for data resampled from the 1000 Genomes project. In contrast, we find that the standard approach of computing an “effective” number of markers does not perform well. The performance of the methods is also demonstrated for a motivating example from the International Cystic Fibrosis Consortium.},
  journal = {Biometrics},
  author  = {Zhou, Yi-Hui and Marron, James S. and Wright, Fred A.},
  year    = {2018},
  month   = jun
}

@article{palowitch_shabalin_zhou_nobel_wright_2018,
  title  = {Estimation of {cis-eQTL} effect sizes using a log of linear model},
  volume = {74},
  ISSN   = {1541-0420},
  url    = {https://doi.org/10.1111/biom.12810},
  DOI    = {10.1111/biom.12810},
  abstractNote = {Summary The study of expression Quantitative Trait Loci (eQTL) is an important problem in genomics and biomedicine. While detection (testing) of eQTL associations has been widely studied, less work has been devoted to the estimation of eQTL effect size. To reduce false positives, detection methods frequently rely on linear modeling of rank-based normalized or log-transformed gene expression data. Unfortunately, these approaches do not correspond to the simplest model of eQTL action, and thus yield estimates of eQTL association that can be uninterpretable and inaccurate. 
In this article, we propose a new, log-of-linear model for eQTL action, termed ACME, that captures allelic contributions to cis-acting eQTLs in an additive fashion, yielding effect size estimates that correspond to a biologically coherent model of cis-eQTLs. We describe a non-linear least-squares algorithm to fit the model by maximum likelihood, and obtain corresponding p-values. We perform careful investigation of the model using a combination of simulated data and data from the Genotype Tissue Expression (GTEx) project. Our results reveal little evidence for dominance effects, a parsimonious result that accords with a simple biological model for allele-specific expression and supports use of the ACME model. We show that Type-I error is well-controlled under our approach in a realistic setting, so that rank-based normalizations are unnecessary. Furthermore, we show that such normalizations can be detrimental to power and estimation accuracy under the proposed model. We then show, through effect size analyses of whole-genome cis-eQTLs in the GTEx data, that using standard normalizations instead of ACME noticeably affects the ranking and sign of estimates.},
  number    = {2},
  journal   = {Biometrics},
  publisher = {Wiley},
  author    = {Palowitch, John and Shabalin, Andrey and Zhou, Yi-Hui and Nobel, Andrew B. and Wright, Fred A.},
  year      = {2018},
  month     = jun,
  pages     = {616--625}
}

@article{aguet_brown_castel_davis_he_jo_mohammadi_park_parsana_segrè_et al._2017,
  title  = {Genetic effects on gene expression across human tissues},
  volume = {550},
  ISSN   = {0028-0836 1476-4687},
  url    = {http://dx.doi.org/10.1038/NATURE24277},
  DOI    = {10.1038/NATURE24277},
  abstractNote = {Abstract Characterization of the molecular function of the human genome and its variation across individuals is essential for identifying the cellular mechanisms that underlie human genetic traits and diseases. 
The Genotype-Tissue Expression (GTEx) project aims to characterize variation in gene expression levels across individuals and diverse tissues of the human body, many of which are not easily accessible. Here we describe genetic effects on gene expression levels across 44 human tissues. We find that local genetic variation affects gene expression levels for the majority of genes, and we further identify inter-chromosomal genetic effects for 93 genes and 112 loci. On the basis of the identified genetic effects, we characterize patterns of tissue specificity, compare local and distal effects, and evaluate the functional properties of the genetic effects. We also demonstrate that multi-tissue, multi-individual data can be used to identify genes and pathways affected by human disease-associated variation, enabling a mechanistic interpretation of gene regulation and the genetic basis of disease.},
  number    = {7675},
  journal   = {Nature},
  publisher = {Springer Science and Business Media LLC},
  author    = {Aguet, François and Brown, Andrew A. and Castel, Stephane E. and Davis, Joe R. and He, Yuan and Jo, Brian and Mohammadi, Pejman and Park, YoSon and Parsana, Princy and Segrè, Ayellet V. and others},
  editor    = {Battle, Alexis and Brown, Christopher D. and Engelhardt, Barbara E. and Montgomery, Stephen B.},
  year      = {2017},
  month     = oct,
  pages     = {204--213}
}

@article{li_kim_tsang_davis_damani_chiang_hess_zappala_strober_scott_et al._2017,
  title  = {The impact of rare variation on gene expression across tissues},
  volume = {550},
  ISSN   = {0028-0836 1476-4687},
  url    = {http://dx.doi.org/10.1038/NATURE24267},
  DOI    = {10.1038/NATURE24267},
  abstractNote = {Abstract Rare genetic variants are abundant in humans and are expected to contribute to individual disease risk1,2,3,4. While genetic association studies have successfully identified common genetic variants associated with susceptibility, these studies are not practical for identifying rare variants1,5. 
Efforts to distinguish pathogenic variants from benign rare variants have leveraged the genetic code to identify deleterious protein-coding alleles1,6,7, but no analogous code exists for non-coding variants. Therefore, ascertaining which rare variants have phenotypic effects remains a major challenge. Rare non-coding variants have been associated with extreme gene expression in studies using single tissues8,9,10,11, but their effects across tissues are unknown. Here we identify gene expression outliers, or individuals showing extreme expression levels for a particular gene, across 44 human tissues by using combined analyses of whole genomes and multi-tissue RNA-sequencing data from the Genotype-Tissue Expression (GTEx) project v6p release12. We find that 58% of underexpression and 28% of overexpression outliers have nearby conserved rare variants compared to 8% of non-outliers. Additionally, we developed RIVER (RNA-informed variant effect on regulation), a Bayesian statistical model that incorporates expression data to predict a regulatory effect for rare variants with higher accuracy than models using genomic annotations alone. Overall, we demonstrate that rare variants contribute to large gene expression changes across tissues and provide an integrative method for interpretation of rare variants in individual genomes.}, number={7675}, journal={Nature}, publisher={Springer Science and Business Media LLC}, author={Li, Xin and Kim, Yungil and Tsang, Emily K. and Davis, Joe R. and Damani, Farhan N. and Chiang, Colby and Hess, Gaelen T. and Zappala, Zachary and Strober, Benjamin J. and Scott, Alexandra J. 
and others}, year={2017}, month=oct, pages={239--243} }

% arXiv preprint: the identifier is stored in eprint/archivePrefix instead of a journal field.
@misc{palowitch_shabalin_zhou_nobel_wright_2016,
  title         = {Estimation of interpretable {eQTL} effect sizes using a log of linear model},
  author        = {Palowitch, John and Shabalin, Andrey and Zhou, Yihui and Nobel, Andrew B and Wright, Fred A},
  year          = {2016},
  howpublished  = {arXiv preprint},
  eprint        = {1605.08799},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1605.08799}
}

@article{luizon_eckalbar_wang_jones_smith_laurance_lin_gallins_etheridge_wright_et al._2016,
  title     = {Genomic characterization of metformin hepatic response},
  volume    = {12},
  number    = {11},
  journal   = {PLoS Genetics},
  publisher = {Public Library of Science},
  author    = {Luizon, Marcelo R and Eckalbar, Walter L and Wang, Yao and Jones, Stacy L and Smith, Robin P and Laurance, Megan and Lin, Lawrence and Gallins, Paul J and Etheridge, Amy S and Wright, Fred and others},
  year      = {2016},
  pages     = {e1006449}
}

@article{tian_patel_ridpath_chen_zhou_neo_clement_takata_takeda_sale_et al._2016,
  title     = {Homologous recombination and translesion {DNA} synthesis play critical roles on tolerating {DNA} damage caused by trace levels of hexavalent chromium},
  volume    = {11},
  number    = {12},
  journal   = {PLoS ONE},
  publisher = {Public Library of Science},
  author    = {Tian, Xu and Patel, Keyur and Ridpath, John R and Chen, Youjun and Zhou, Yi-Hui and Neo, Dayna and Clement, Jean and Takata, Minoru and Takeda, Shunichi and Sale, Julian and others},
  year      = {2016},
  pages     = {e0167503}
}

@inproceedings{theisen_williams_2016,
  title     = {Poster: risk-based attack surface approximation},
  booktitle = {Symposium and Bootcamp on the Science of Security},
  author    = {Theisen, C. and Williams, L.},
  year      = {2016},
  pages     = {121--123}
}

@article{zhou_wright_2016,
  title   = {The projack: a resampling approach to correct for ranking bias in high-throughput studies},
  volume  = {17},
  number  = {1},
  journal = {Biostatistics},
  author  = {Zhou, Yi-Hui and Wright, Fred A.},
  year    = {2016},
  pages   = {54--64}
}

@article{zhou_marron_2016,
  title  = {Visualization of robust {L1PCA}},
  volume = {5},
  ISSN   = {2049-1573},
  url    = {http://dx.doi.org/10.1002/STA4.113},
  DOI    = {10.1002/STA4.113},
  abstractNote = {Robust principal components are particularly challenging to find for high‐dimensional data sets, including genomic data. Conventional principal component analysis is often unduly influenced by a few closely related family members. This phenomenon is explained using the ideas of a high‐dimensional low sample size geometric representation. These ideas further show why the earlier robust method of spherical principal components fails to solve this problem. A solution is provided, which is called the visual L1 principal component analysis (VL1PCA). This approach is based on a backwards L1‐norm best‐fit idea. VL1PCA improves upon the best previous version of L1PCA by providing interpretable scores and a scatterplot visualization of the data. 
Another contribution is a new notion of robust centre, the backwards L1 median. The utility of VL1PCA is illustrated on examples and a real high‐dimensional data set. Our VL1PCA is not only robust to outliers but also gives a meaningful population stratification for data even in the presence of special family structure, when other methods fail. © 2016 The Authors. Stat Published by John Wiley & Sons Ltd},
  number    = {1},
  journal   = {Stat},
  publisher = {Wiley},
  author    = {Zhou, Yi-Hui and Marron, J. S.},
  year      = {2016},
  pages     = {173--184}
}

@article{rivas_pirinen_conrad_lek_tsang_karczewski_maller_kukurba_deluca_fromer_et al._2015,
  title     = {Effect of predicted protein-truncating genetic variants on the human transcriptome},
  volume    = {348},
  number    = {6235},
  journal   = {Science},
  publisher = {American Association for the Advancement of Science},
  author    = {Rivas, Manuel A and Pirinen, Matti and Conrad, Donald F and Lek, Monkol and Tsang, Emily K and Karczewski, Konrad J and Maller, Julian B and Kukurba, Kimberly R and DeLuca, David S and Fromer, Menachem and others},
  year      = {2015},
  pages     = {666--669}
}

@article{demerath_guan_grove_aslibekyan_mendelson_zhou_hedman_sandling_li_irvin_et al._2015,
  title     = {Epigenome-wide association study ({EWAS}) of {BMI}, {BMI} change and waist circumference in {African American} adults identifies multiple replicated loci},
  volume    = {24},
  number    = {15},
  journal   = {Human Molecular Genetics},
  publisher = {Oxford University Press},
  author    = {Demerath, Ellen W and Guan, Weihua and Grove, Megan L and Aslibekyan, Stella and Mendelson, Michael and Zhou, Yi-Hui and Hedman, Åsa K and Sandling, Johanna K and Li, Li-An and Irvin, Marguerite R and others},
  year      = {2015},
  pages     = {4464--4479}
}

@article{o’neal_gallins_pace_dang_wolf_jones_guo_zhou_madar_huang_et al._2015, title={Gene Expression in Transformed Lymphocytes Reveals Variation in Endomembrane and HLA Pathways Modifying Cystic Fibrosis Pulmonary Phenotypes}, volume={96}, ISSN={0002-9297}, url={http://dx.doi.org/10.1016/J.AJHG.2014.12.022}, DOI={10.1016/J.AJHG.2014.12.022}, abstractNote={Variation in cystic fibrosis (CF) phenotypes, including lung disease severity, age of onset of persistent Pseudomonas aeruginosa (P. aeruginosa) lung infection, and presence of meconium ileus (MI), has been partially explained by genome-wide association studies (GWASs). It is not expected that GWASs alone are sufficiently powered to uncover all heritable traits associated with CF phenotypic diversity. Therefore, we utilized gene expression association from lymphoblastoid cells lines from 754 p.Phe508del CF-affected homozygous individuals to identify genes and pathways. LPAR6, a G protein coupled receptor, associated with lung disease severity (false discovery rate q value = 0.0006). Additional pathway analyses, utilizing a stringent permutation-based approach, identified unique signals for all three phenotypes. 
Pathways associated with lung disease severity were annotated in three broad categories: (1) endomembrane function, containing p.Phe508del processing genes, providing evidence of the importance of p.Phe508del processing to explain lung phenotype variation; (2) HLA class I genes, extending previous GWAS findings in the HLA region; and (3) endoplasmic reticulum stress response genes. Expression pathways associated with lung disease were concordant for some endosome and HLA pathways, with pathways identified using GWAS associations from 1,978 CF-affected individuals. Pathways associated with age of onset of persistent P. aeruginosa infection were enriched for HLA class II genes, and those associated with MI were related to oxidative phosphorylation. Formal testing demonstrated that genes showing differential expression associated with lung disease severity were enriched for heritable genetic variation and expression quantitative traits. Gene expression provided a powerful tool to identify unrecognized heritable variation, complementing ongoing GWASs in this rare disease. Variation in cystic fibrosis (CF) phenotypes, including lung disease severity, age of onset of persistent Pseudomonas aeruginosa (P. aeruginosa) lung infection, and presence of meconium ileus (MI), has been partially explained by genome-wide association studies (GWASs). It is not expected that GWASs alone are sufficiently powered to uncover all heritable traits associated with CF phenotypic diversity. Therefore, we utilized gene expression association from lymphoblastoid cells lines from 754 p.Phe508del CF-affected homozygous individuals to identify genes and pathways. LPAR6, a G protein coupled receptor, associated with lung disease severity (false discovery rate q value = 0.0006). Additional pathway analyses, utilizing a stringent permutation-based approach, identified unique signals for all three phenotypes. 
Pathways associated with lung disease severity were annotated in three broad categories: (1) endomembrane function, containing p.Phe508del processing genes, providing evidence of the importance of p.Phe508del processing to explain lung phenotype variation; (2) HLA class I genes, extending previous GWAS findings in the HLA region; and (3) endoplasmic reticulum stress response genes. Expression pathways associated with lung disease were concordant for some endosome and HLA pathways, with pathways identified using GWAS associations from 1,978 CF-affected individuals. Pathways associated with age of onset of persistent P. aeruginosa infection were enriched for HLA class II genes, and those associated with MI were related to oxidative phosphorylation. Formal testing demonstrated that genes showing differential expression associated with lung disease severity were enriched for heritable genetic variation and expression quantitative traits. Gene expression provided a powerful tool to identify unrecognized heritable variation, complementing ongoing GWASs in this rare disease. The genetic architecture of phenotypic variability in cystic fibrosis (CF [MIM 219700]) is beginning to be defined,1Wright F.A. Strug L.J. Doshi V.K. Commander C.W. Blackman S.M. Sun L. Berthiaume Y. Cutler D. Cojocaru A. Collaco J.M. et al.Genome-wide association and linkage identify modifier loci of lung disease severity in cystic fibrosis at 11p13 and 20q13.2.Nat. Genet. 2011; 43: 539-546Crossref PubMed Scopus (190) Google Scholar, 2Green D.M. Collaco J.M. McDougal K.E. Naughton K.M. Blackman S.M. Cutting G.R. Heritability of respiratory infection with Pseudomonas aeruginosa in cystic fibrosis.J. Pediatr. 2012; 161: 290-295.e1Abstract Full Text Full Text PDF PubMed Scopus (29) Google Scholar, 3Emond M.J. Louie T. Emerson J. Zhao W. Mathias R.A. Knowles M.R. Wright F.A. Rieder M.J. Tabor H.K. Nickerson D.A. 
et al.National Heart, Lung, and Blood Institute (NHLBI) GO Exome Sequencing ProjectLung GOExome sequencing of extreme phenotypes identifies DCTN4 as a modifier of chronic Pseudomonas aeruginosa infection in cystic fibrosis.Nat. Genet. 2012; 44: 886-889Crossref PubMed Scopus (170) Google Scholar, 4Li W. Soave D. Miller M.R. Keenan K. Lin F. Gong J. Chiang T. Stephenson A.L. Durie P. Rommens J. et al.Unraveling the complex genetic model for cystic fibrosis: pleiotropic effects of modifier genes on early cystic fibrosis-related morbidities.Hum. Genet. 2014; 133: 151-161Crossref PubMed Scopus (82) Google Scholar, 5Sun L. Rommens J.M. Corvol H. Li W. Li X. Chiang T.A. Lin F. Dorfman R. Busson P.F. Parekh R.V. et al.Multiple apical plasma membrane constituents are associated with susceptibility to meconium ileus in individuals with cystic fibrosis.Nat. Genet. 2012; 44: 562-569Crossref PubMed Scopus (150) Google Scholar but GWASs for CF are limited by numbers of subjects compared to common diseases, where tens of thousands of subjects have been used to identify pathophysiologically relevant pathways.6Okada Y. Wu D. Trynka G. Raj T. Terao C. Ikari K. Kochi Y. Ohmura K. Suzuki A. Yoshida S. et al.RACI consortiumGARNET consortiumGenetics of rheumatoid arthritis contributes to biology and drug discovery.Nature. 2014; 506: 376-381Crossref PubMed Scopus (1418) Google Scholar, 7Bønnelykke K. Matheson M.C. Pers T.H. Granell R. Strachan D.P. Alves A.C. Linneberg A. Curtin J.A. Warrington N.M. Standl M. et al.Australian Asthma Genetics Consortium (AAGC)EArly Genetics and Lifecourse Epidemiology (EAGLE) ConsortiumMeta-analysis of genome-wide association studies identifies ten loci influencing allergic sensitization.Nat. Genet. 2013; 45: 902-906Crossref PubMed Scopus (190) Google Scholar, 8Jostins L. Ripke S. Weersma R.K. Duerr R.H. McGovern D.P. Hui K.Y. Lee J.C. Schumm L.P. Sharma Y. Anderson C.A. 
et al.International IBD Genetics Consortium (IIBDGC)Host-microbe interactions have shaped the genetic architecture of inflammatory bowel disease.Nature. 2012; 491: 119-124Crossref PubMed Scopus (3230) Google Scholar Studies of gene expression provide an alternative approach to identify gene modifiers.9Cookson W. Liang L. Abecasis G. Moffatt M. Lathrop M. Mapping complex disease traits with global gene expression.Nat. Rev. Genet. 2009; 10: 184-194Crossref PubMed Scopus (612) Google Scholar, 10Emilsson V. Thorleifsson G. Zhang B. Leonardson A.S. Zink F. Zhu J. Carlson S. Helgason A. Walters G.B. Gunnarsdottir S. et al.Genetics of gene expression and its effect on disease.Nature. 2008; 452: 423-428Crossref PubMed Scopus (1007) Google Scholar, 11Nica A.C. Montgomery S.B. Dimas A.S. Stranger B.E. Beazley C. Barroso I. Dermitzakis E.T. Candidate causal regulatory effects by integration of expression QTLs with complex trait genetic associations.PLoS Genet. 2010; 6: e1000895Crossref PubMed Scopus (313) Google Scholar Based upon the established utility of gene expression studies in lymphoblastoid cell lines (LCLs),12Nicolae D.L. Gamazon E. Zhang W. Duan S. Dolan M.E. Cox N.J. Trait-associated SNPs are more likely to be eQTLs: annotation to enhance discovery from GWAS.PLoS Genet. 2010; 6: e1000888Crossref PubMed Scopus (910) Google Scholar, 13Stranger B.E. Forrest M.S. Dunning M. Ingle C.E. Beazley C. Thorne N. Redon R. Bird C.P. de Grassi A. Lee C. et al.Relative impact of nucleotide and copy number variation on gene expression phenotypes.Science. 2007; 315: 848-853Crossref PubMed Scopus (1347) Google Scholar, 14Zhang W. Duan S. Kistner E.O. Bleibel W.K. Huang R.S. Clark T.A. Chen T.X. Schweitzer A.C. Blume J.E. Cox N.J. Dolan M.E. Evaluation of genetic variation contributing to differences in gene expression between populations.Am. J. Hum. Genet. 
2008; 82: 631-640Abstract Full Text Full Text PDF PubMed Scopus (157) Google Scholar global gene expression was measured from LCLs of a highly phenotyped CF cohort previously used for GWAS analysis1Wright F.A. Strug L.J. Doshi V.K. Commander C.W. Blackman S.M. Sun L. Berthiaume Y. Cutler D. Cojocaru A. Collaco J.M. et al.Genome-wide association and linkage identify modifier loci of lung disease severity in cystic fibrosis at 11p13 and 20q13.2.Nat. Genet. 2011; 43: 539-546Crossref PubMed Scopus (190) Google Scholar and analyzed for association with three distinct CF phenotypes: lung disease severity, age of onset of persistent Pseudomonas aeruginosa (P. aeruginosa) pulmonary infection, and meconium ileus (MI [MIM 614665]) at birth (Table 1; Figure S1).Table 1Characteristics of Subject Population for PhenotypesStudy GroupConsortium Lung Phenotype (Primary Analysis)aSubjects were classified as having either severe or mild lung disease, as defined by the quantitative Consortium lung phenotype (KNoRMA) value of <0.3 or >0.3, respectively.18Age of Onset of Persistent Pseudomonas aeruginosaMeconium Ileus (MI)Size of PopulationAge at Enrollment (year)No. Males (%)No. EuropeanbBased on self-identified ancestry and principal components analysis via SNP genotypes. (%)Persistent Culture PositivecData were obtained at the encounter level (each clinic visit) from the Cystic Fibrosis Foundation (CFF) Patient Registry. Persistent is defined as cultured P. aeruginosa in respiratory cultures 2 years in a row, or 2 out of 3 years, unless subjects had at least 5 consecutive years of negative cultures after meeting minimal criteria (2 out of 3 years of positive cultures). Subjects who were above age 7 needed to have a negative culture before the first positive culture to be included into the analysis.28 There were 14 severe and 30 mild subjects who were negative for P. aeruginosa at last culture. 
(%)Age of Onset (year)Presence of MIdSubjects were confirmed to have MI if a diagnosis at birth was supported by source documents, such as the original surgical or medical report, detailed clinical or admissions note, or verbal confirmation from the subject or the parent with documentation of an abdominal scar. Subjects were removed from the analysis if MI could not be confirmed or if the diagnosis was unclear or unknown. (%)Mean ± SDRangeMean ± SDRangeSevere31716.5 ± 4.68–25157 (49.5)317 (100)208 of 222 (93.7)5.2 ± 4.30.6–1952 of 301 (17.3)Mild43728.0 ± 9.915–58221 (50.5)437 (100)203 of 233 (87.1)16.8 ± 10.30.6–5754 of 405 (13.3)ePresence of MI was 17.6% (36 of 205) for subjects enrolled at 15–25 years of age.Total754455706a Subjects were classified as having either severe or mild lung disease, as defined by the quantitative Consortium lung phenotype (KNoRMA) value of <0.3 or >0.3, respectively.18Taylor C. Commander C.W. Collaco J.M. Strug L.J. Li W. Wright F.A. Webel A.D. Pace R.G. Stonebraker J.R. Naughton K. et al.A novel lung disease phenotype adjusted for mortality attrition for cystic fibrosis genetic modifier studies.Pediatr. Pulmonol. 2011; 46: 857-869Crossref PubMed Scopus (41) Google Scholarb Based on self-identified ancestry and principal components analysis via SNP genotypes.c Data were obtained at the encounter level (each clinic visit) from the Cystic Fibrosis Foundation (CFF) Patient Registry. Persistent is defined as cultured P. aeruginosa in respiratory cultures 2 years in a row, or 2 out of 3 years, unless subjects had at least 5 consecutive years of negative cultures after meeting minimal criteria (2 out of 3 years of positive cultures). Subjects who were above age 7 needed to have a negative culture before the first positive culture to be included into the analysis.28Pittman J.E. Calloway E.H. Kiser M. Yeatts J. Davis S.D. Drumm M.L. Schechter M.S. Leigh M.W. Emond M. Van Rie A. Knowles M.R. 
Age of Pseudomonas aeruginosa acquisition and subsequent severity of cystic fibrosis lung disease.Pediatr. Pulmonol. 2011; 46: 497-504PubMed Google Scholar There were 14 severe and 30 mild subjects who were negative for P. aeruginosa at last culture.d Subjects were confirmed to have MI if a diagnosis at birth was supported by source documents, such as the original surgical or medical report, detailed clinical or admissions note, or verbal confirmation from the subject or the parent with documentation of an abdominal scar. Subjects were removed from the analysis if MI could not be confirmed or if the diagnosis was unclear or unknown.e Presence of MI was 17.6% (36 of 205) for subjects enrolled at 15–25 years of age. Open table in a new tab Affymetrix Human Exon (1.0 ST) microarray data were collected from RNA isolated from 754 LCLs selected from a cohort of 1,137 samples from CFTR (MIM 602421) p.Phe508del European individuals homozygous for the mutation (chr7: 98,809–98,811 delCTT; RefSeq accession number NG_016465.3; c.1521_1523delCTT). These CF-affected individuals were originally obtained for the Genetic Modifiers in CF Lung Disease Study where a GWAS had been performed1Wright F.A. Strug L.J. Doshi V.K. Commander C.W. Blackman S.M. Sun L. Berthiaume Y. Cutler D. Cojocaru A. Collaco J.M. et al.Genome-wide association and linkage identify modifier loci of lung disease severity in cystic fibrosis at 11p13 and 20q13.2.Nat. Genet. 2011; 43: 539-546Crossref PubMed Scopus (190) Google Scholar (Figure S2). Considerable efforts were taken to ensure that high-quality microarray data were utilized and that interpretation would not be confused by known effect of SNPs on probe hybridization kinetics (Figure S2). For the highly polymorphic HLA region, probe set filtering removed 438 of the 797 probe sets. 
However, because of the concern that probe set filtering might not have been adequate in HLA genes, additional analysis was performed to identify HLA genes whose expression values were probably affected by probe set binding (Figure S3). As a result of this analysis, HLA-DRB1 (MIM 142857) expression values were removed from subsequent analysis. The study was approved by the biomedical institutional review board of the University of North Carolina and the institutional review board of each participating institution. CF-affected individuals and their parents (if they were a minor) provided written informed consent. Linear regression was utilized to establish association of gene expression with phenotypes. Gene expression values meeting a minimal threshold of expression above 6.03 (on the Affymetrix RMA standard log2 scale) were utilized, based on the 95th percentile of mean "expression" in females for genes on the Y chromosome, because this threshold was considered to reliably represent true signal above background. All genes meeting this criterion (12,033 out of 17,868 annotated genes; 67.3%) were included in the linear regression analysis, including genes whose probes overlaid SNPs with high minor allele frequency (MAF), but these genes were "flagged" so that potentially important interpretive issues could be considered later. The covariates used for all analyses are listed in Table S1. The genotype PCs used as covariates were calculated with Eigenstrat15Price A.L. Patterson N.J. Plenge R.M. Weinblatt M.E. Shadick N.A. Reich D. Principal components analysis corrects for stratification in genome-wide association studies.Nat. Genet. 2006; 38: 904-909Crossref PubMed Scopus (6831) Google Scholar and available genotype data from the previously conducted GWASs.1Wright F.A. Strug L.J. Doshi V.K. Commander C.W. Blackman S.M. Sun L. Berthiaume Y. Cutler D. Cojocaru A. Collaco J.M. 
et al.Genome-wide association and linkage identify modifier loci of lung disease severity in cystic fibrosis at 11p13 and 20q13.2.Nat. Genet. 2011; 43: 539-546Crossref PubMed Scopus (190) Google Scholar The surrogate variables of gene expression data were calculated with the "sva" package in Bioconductor in R.16Leek J.T. Johnson W.E. Parker H.S. Jaffe A.E. Storey J.D. The sva package for removing batch effects and other unwanted variation in high-throughput experiments.Bioinformatics. 2012; 28: 882-883Crossref PubMed Scopus (2291) Google Scholar The Q-Q plots for all three phenotypes suggested that the covariates included were appropriate to control for population stratification or technical factors that could potentially lead to false positives (Figure S4). The expression of lysophosphatidic acid receptor 6 (LPAR6 [MIM 278150]) achieved transcriptome-wide significance for association with lung disease (false discovery rate q value = 0.0006, p value = 5.35 × 10−8), using both standard and alternative probe annotation (ANNMAP, formerly known as X:MAP),17Yates T. Okoniewski M.J. Miller C.J. X:Map: annotation and visualization of genome structure for Affymetrix exon array analysis.Nucleic Acids Res. 2008; 36: D780-D786Crossref PubMed Scopus (53) Google Scholar with higher levels of LPAR6 being associated with worse lung function. Array-based LPAR6 expression was technically validated by TaqMan quantitative real-time PCR (p < 0.0001 between 36 low-expressing and 40 high-expressing LCL samples from CF-affected individuals). CHMP4C (p = 1.05 × 10−5 [MIM 610899]), SSBP2 (p = 2.60 × 10−5 [MIM 607389]), and P2RX4 (p = 8.03 × 10−5 [MIM 600846]) were suggestive for association (Table S2; Figure S5; see Table S5 for complete list). As explicitly accounted for by the Consortium lung phenotype,18Taylor C. Commander C.W. Collaco J.M. Strug L.J. Li W. Wright F.A. Webel A.D. Pace R.G. Stonebraker J.R. Naughton K. 
et al.A novel lung disease phenotype adjusted for mortality attrition for cystic fibrosis genetic modifier studies.Pediatr. Pulmonol. 2011; 46: 857-869Crossref PubMed Scopus (41) Google Scholar older surviving CF-affected individuals have milder lung disease, reflecting high mortality in CF (Table 1). To investigate a possible relationship between age and gene expression in the CF cohort, but unrelated to CF lung disease, we examined three large external studies of LCL gene expression. These included a childhood asthma (MIM 600807) cohort evaluated on the Affymetrix platform,19Liang L. Morar N. Dixon A.L. Lathrop G.M. Abecasis G.R. Moffatt M.F. Cookson W.O. A cross-platform analysis of 14,177 expression quantitative trait loci derived from lymphoblastoid cell lines.Genome Res. 2013; 23: 716-726Crossref PubMed Scopus (108) Google Scholar available data from the Cholesterol and Pharmacogenomics (CAP) trial (available on ArrayExpress),20Yu C.Y. Theusch E. Lo K. Mangravite L.M. Naidoo D. Kutilova M. Medina M.W. HNRNPA1 regulates HMGCR alternative splicing and modulates cellular cholesterol metabolism.Hum. Mol. Genet. 2014; 23: 319-332Crossref PubMed Scopus (46) Google Scholar and the Multiple Tissue Human Expression Resource (MuTHER) study.21Glass D. Viñuela A. Davies M.N. Ramasamy A. Parts L. Knowles D. Brown A.A. Hedman A.K. Small K.S. Buil A. et al.UK Brain Expression consortiumMuTHER consortiumGene expression changes with age in skin, adipose tissue, blood and brain.Genome Biol. 2013; 14: R75Crossref PubMed Scopus (189) Google Scholar No correspondence emerged between differentially expressed genes for the Consortium lung phenotype and those associated with age in these three non-CF populations (Figure S6), although LPAR6 was nominally associated with age (not corrected for multiple comparison) in older women (age ∼59 years) in the MuTHER study.21Glass D. Viñuela A. Davies M.N. Ramasamy A. Parts L. Knowles D. Brown A.A. Hedman A.K. Small K.S. Buil A. 
et al.UK Brain Expression consortiumMuTHER consortiumGene expression changes with age in skin, adipose tissue, blood and brain.Genome Biol. 2013; 14: R75Crossref PubMed Scopus (189) Google Scholar Consequently, we conclude that the associations seen in our study reflect CF lung disease severity and not aging. Rigorous "pathway" (gene set) analysis was conducted via a permutation-based approach (Significance Analysis of Function and Expression; SAFE), which accounts for gene expression correlation structures and allows testing of both standard and custom-derived pathways.22Barry W.T. Nobel A.B. Wright F.A. Significance analysis of functional categories in gene expression studies: a structured permutation approach.Bioinformatics. 2005; 21: 1943-1949Crossref PubMed Scopus (253) Google Scholar Pathway analysis was conducted by SAFE in R (v.3.0) and annotation databases (available at Bioconductor) hugene10stprobeset.db and GO.db (Gene Ontology annotation maps). Multiple pathways with q values < 0.15 were found to associate with lung disease severity (Table 2; Table S6, tab A). Of the 35 pathways listed (Table 2), 16 were related to the endomembrane system for synthesis and post-translational modification of membrane proteins (membranes, vesicle traffic, and Golgi/endoplasmic reticulum [ER]) and two pathways were related to ER stress response, which also could represent a subset of endomembrane processes. Of the 11 Gene Ontology (GO) Cellular Component pathways, 7 contained HLA class I genes, and custom-derived pathways consisting exclusively of HLA genes were also highly significant (Table 2). Importantly, although the HLA genes clearly contributed to the significance of the endomembrane pathways, these same pathways also contained TTC35 (Table 2 [MIM 607722]) and TMEM85 (Table S6, tab A; p value = 0.06), which are the human homologs of yeast genes EMC2 and EMC4, respectively, known to modulate yeast homolog of p.Phe508del processing.23Louie R.J. Guo J. Rodgers J.W. 
White R. Shah N. Pagant S. Kim P. Livstone M. Dolinski K. McKinney B.A. et al.A yeast phenomic model for the gene interaction network modulating CFTR-ΔF508 protein biogenesis.Genome Med. 2012; 4: 103Crossref PubMed Scopus (55) Google Scholar MetaMiner Cystic Fibrosis Specific Pathways not containing HLA genes also supported association with p.Phe508del processing (Table 2). We conclude that three important pathophysiological signals have emerged: HLA class I, p.Phe508del processing, and the ER stress response. The significance of the miR21 (miRNA-21 [MIM 611020]) pathway is also relevant given the expanding role of this microRNA (miRNA) in pulmonary biology.24Kumarswamy R. Volkmann I. Thum T. Regulation and function of miRNA-21 in health and disease.RNA Biol. 2011; 8: 706-713Crossref PubMed Scopus (444) Google Scholar Most pathways trended in the "up" direction (increased expression of genes in the pathways associated with milder lung disease), with two pathways (annotated to germ cell nuclei) trending "down."Table 2Gene Expression Pathways Significantly Associated with Consortium Lung PhenotypePathwayGenesStatisticsIDNameNumber↑aNumber of genes in pathway with increased expression.↓bNumber of genes in pathway with decreased expression.TrendcUp (increased) or down (decreased) differential expression of genes in the pathways associated with milder lung disease. 
Two-sided indicates pathways that contained both increased and decreased differentially expressed genes that contributed significantly to the signal.p ValuedDetermined by 10,000 permutations in the SAFE package.22q ValueeBenjamini-Hochberg false-discovery for pathways testing within each pathway set; q values < 0.15 were included.Genes with Gene-Level p Value < 0.05 (Ordered by p Value)fSee Table S6 (tab A) for the inclusive list of genes for these pathways; "none" indicates that no individual genes within the pathway had a p value less than 0.05; see Table S5 for gene MIM numbers.GO Cellular Component Pathways0001673male germ cell nucleus14014down0.00010.0164TNP1; REC8; TCFL50012507ER to Golgi transport vesicle membrane25232up0.00030.0481HLA-E; MCFD2; TMED7; HLA-F0043073germ cell nucleus17116down0.00040.0442TNP1; REC8; TCFL50042470;0048770melanosome; pigment granule785226up0.00070.0582SLC3A2; TPP1; CTSD; ANXA2; STOM; HSPA5; BSG0030134ER to Golgi transport vesicle29254up0.00110.0737HLA-E; MCFD2; TMED7; HLA-F0030176integral to endoplasmic reticulum membrane856421up0.00240.1181TTC35; HLA-E; EDEM1; TAP1; SELS; HLA-F; HSPA5; MMGT10031301integral to organelle membrane17111358up0.00260.1181TTC35; HLA-E; EDEM1; TAP1; SELS; ST6GALNAC6; HLA-F; A4GALT; ARMCX3; P2RX7; LARGE; HSPA5; MMGT10000421autophagic vacuole membrane13112up0.00280.1181WIPI1; ATG9A0031227intrinsic to endoplasmic reticulum membrane957025up0.00310.1181TTC35; HLA-E; EDEM1; TAP1; SELS; HLA-F; HSPA5; MMGT10031300intrinsic to organelle membrane18412163up0.00360.1231TTC35; HLA-E; EDEM1; TAP1; SELS; ST6GALNAC6; HLA-F; A4GALT; ARMCX3; P2RX7; LARGE; HSPA5; MMGT10030658transport vesicle membrane493316up0.00390.1231HLA-E; MCFD2; TMED7; HLA-F; NCALDGO Biological Process Pathways0006518peptide metabolic process644618up0.00010.0837GSTK1; DNPEP; PSEN2; TPP10072384organelle transport along microtubule24213up0.00010.0837PRKCZ; COPG0006925inflammatory cell apoptotic process10100up0.00030.1107none0006944cellular membrane 
fusion614219up0.00030.1107CD9; PLDN; ANXA2; BET10007030golgi organization382810up0.00030.1107GCC2; BHLHA15; GOLGB1; PLK3; COG1; TMED20043603cellular amide metabolic process1016536up0.00030.1107GSTK1; DNPEP; PSEN2; TPP1; PRKCD0034067protein localization to Golgi apparatus14131up0.00040.1166GOLGA4; GCC2; ATG9A0045684positive regulation of epidermis development11101up0.00040.1166noneGO Molecular Function Pathways0050839cell adhesion molecule binding331518two sided0.00040.1181P2RX4; MLLT4; CD1D;gFor the two-sided "Trend," these genes have a "down" trend. CTNNA1; PVRL1gFor the two-sided "Trend," these genes have a "down" trend.0042287MHC protein binding1596two sided0.00060.1191TAP1; LAG3; MARCH8MSigDB PathwaysATAAGCT.MIR.21814536two sided0.00010.0387BAHD1; BTBD3;gFor the two-sided "Trend," these genes have a "down" trend. C5orf41; STK40; UBR3; NF2;gFor the two-sided "Trend," these genes have a "down" trend. SSFA2; JAG1; PPARA; PELI1; RHOB; CREBL2V.HMGIY_Q61587088two sided0.00060.1499ZNF675;gFor the two-sided "Trend," these genes have a "down" trend. LMO4; TNFSF11;gFor the two-sided "Trend," these genes have a "down" trend. PLAGL2; POLD3;gFor the two-sided "Trend," these genes have a "down" trend. SLC7A1; UBE2E2;gFor the two-sided "Trend," these genes have a "down" trend. TAZ; UBR3; MRC2;gFor the two-sided "Trend," these genes have a "down" trend. 
TNFSF4; IKZF2gFor the two-sided "Trend," these genes have a "down" trend.MetaMiner Cystic Fibrosis Specific PathwayshMetaMiner CF Specific Pathways represent a version of Thomson Reuters' (formerly GeneGo) MetaDiscovery suite that is enriched with content specific for cystic fibrosis.cholesterol and sphingolipids transport/recycling to plasma membrane in lung (normal and CF)1495two sided0.00360.0597ABCG1gFor the two-sided "Trend," these genes have a "down" trend.normal wtCFTR traffic/sorting endosome formation14113up0.00520.0621noneF508-CFTR traffic/ER-to-Golgi in CF; Normal wtCFTR traffic/ER-to-Golgi22202up0.00750.0621COPG; COPZ2mucin expression in CF via TLRs, EGFR signaling pathways483414up0.01160.0770JUN; PRKCDPFAM Pathways00035double-stranded RNA binding motif17215down0.00010.0135STRBP; STAU207716basic region leucine zipper1174two sided0.00020.0276DDIT3; CREBL2; CEBPB03953tubulin C-terminal domain15213down0.00090.0804TUBB2BCF Relevant Custom PathwaysER stress response16912742up0.00050.0106DNAJB9; EDEM1; CISD2; TANK; DDIT3; SERP1; FDPS; LONP1; NANS; SSR4; JUN; GADD45A; LY9; PGM3; HSPA5; ARF4; IER3IP1; BTG2; CEBPB; CNIH; MANF; PDIA6XBP1 target genes13103two sided0.00790.1165DNAJB9; EDEM1; SERP1; PDIA6HLA-Specific Pathwaysclass I330up0.02210.0261HLA-E; HLA-Fclass II871up0.08680.0580noneclass I and class II11101up0.02990.0261HLA-E; HLA-FPathways limited to those with ≥10 but ≤200 genes. SAFE analysis utilized 10,000 permutations to establish significance thresholds. CF Relevant Custom Pathways developed primarily as described for mice46Saini Y. Dang H. Livraghi-Butrico A. Kelly E.J. Jones L.C. O'Neal W.K. Boucher R.C. Gene expression in whole lung and pulmonary macrophages reflects the dynamic pathology associated with airway surface dehydration.BMC Genomics. 
2014; 15: 726Crossref PubMed Scopus (34) Google Scholar using human gene counterparts (Table S8).a Number of genes in pathway with increased expression.b Number of genes in pathway with decreased expression.c Up (increased) or down (decreased) differential expression of genes in the pathways associated with milder lung disease. Two-sided indicates pathways that contained both increased and decreased differentially expressed genes that contributed significantly to the signal.d Determined by 10,000 permutations in the SAFE package.22Barry W.T. Nobel A.B. Wright F.A. Significance analysis of functional categories in gene expression studies: a structured permutation approach.Bioinformatics. 2005; 21: 1943-1949Crossref PubMed Scopus (253) Google Scholare Benjamini-Hochberg false-discovery for pathways testing within each pathway set; q values < 0.15 were included.f See Table S6 (tab A) for the inclusive list of genes for these pathways; "none" indicates that no individual genes within the pathway had a p value less than 0.05; see Table S5 for gene MIM numbers.g For the two-sided "Trend," these genes have a "down" trend.h MetaMiner CF Specific Pathways represent a version of Thomson Reuters' (formerly GeneGo) MetaDiscovery suite that is enriched with content specific for cystic fibrosis. Open table in a new tab Pathways limited to those with ≥10 but ≤200 genes. SAFE analysis utilized 10,000 permutations to establish s}, number={2}, journal={The American Journal of Human Genetics}, publisher={Elsevier BV}, author={O’Neal, Wanda K. and Gallins, Paul and Pace, Rhonda G. and Dang, Hong and Wolf, Whitney E. and Jones, Lisa C. 
and Guo, XueLiang and Zhou, Yi-Hui and Madar, Vered and Huang, Jinyan and et al.}, year={2015}, month={Feb}, pages={318–328} }

% Same journal issue, author list and page range as the record that ends just
% above; presumably a second export of that article (verify before de-duplicating).
@article{o’neal_gallins_pace_dang_wolf_jones_guo_zhou_madar_huang_et al._2015,
  title = {Gene expression in transformed lymphocytes reveals variation in endomembrane and {HLA} pathways modifying cystic fibrosis pulmonary phenotypes},
  volume = {96},
  number = {2},
  journal = {The American Journal of Human Genetics},
  publisher = {Cell Press},
  author = {O’Neal, Wanda K and Gallins, Paul and Pace, Rhonda G and Dang, Hong and Wolf, Whitney E and Jones, Lisa C and Guo, XueLiang and Zhou, Yi-Hui and Madar, Vered and Huang, Jinyan and others},
  year = {2015},
  pages = {318--328}
}

% One record per key: BibTeX rejects repeated keys, so the article number
% (pages = 8382) that the export listed in a second copy of this key lives here.
@article{corvol_blackman_boëlle_gallins_pace_stonebraker_accurso_clement_collaco_dang_et al._2015,
  title = {Genome-wide association meta-analysis identifies five modifier loci of lung disease severity in cystic fibrosis},
  volume = {6},
  ISSN = {2041-1723},
  url = {http://dx.doi.org/10.1038/NCOMMS9382},
  DOI = {10.1038/NCOMMS9382},
  abstractNote = {The identification of small molecules that target specific CFTR variants has ushered in a new era of treatment for cystic fibrosis (CF), yet optimal, individualized treatment of CF will require identification and targeting of disease modifiers. Here we use genome-wide association analysis to identify genetic modifiers of CF lung disease, the primary cause of mortality. Meta-analysis of 6,365 CF patients identifies five loci that display significant association with variation in lung disease. Regions on chr3q29 (MUC4/MUC20; $P=3.3\times10^{-11}$), chr5p15.3 (SLC9A3; $P=6.8\times10^{-12}$), chr6p21.3 (HLA Class II; $P=1.2\times10^{-8}$) and chrXq22-q23 (AGTR2/SLC6A14; $P=1.8\times10^{-9}$) contain genes of high biological relevance to CF pathophysiology. The fifth locus, on chr11p12-p13 (EHF/APIP; $P=1.9\times10^{-10}$), was previously shown to be associated with lung disease. These results provide new insights into potential targets for modulating lung disease severity in CF.},
  number = {1},
  journal = {Nature Communications},
  publisher = {Springer Science and Business Media LLC},
  author = {Corvol, Harriet and Blackman, Scott M. and Boëlle, Pierre-Yves and Gallins, Paul J. and Pace, Rhonda G. and Stonebraker, Jaclyn R. and Accurso, Frank J. and Clement, Annick and Collaco, Joseph M. and Dang, Hong and others},
  year = {2015},
  month = sep,
  pages = {8382}
}

% ASCII-key variant of the same work (same title, volume, journal and year);
% kept under its own key so citations of that key still resolve.
@article{corvol_blackman_boelle_gallins_pace_stonebraker_accurso_clement_collaco_dang_et al._2015,
  title = {Genome-wide association meta-analysis identifies five modifier loci of lung disease severity in cystic fibrosis},
  volume = {6},
  journal = {Nature Communications},
  author = {Corvol, H. and Blackman, S. M. and Boelle, P. Y. and Gallins, P. J. and Pace, R. G. and Stonebraker, J. R. and Accurso, F. J. and Clement, A. and Collaco, J. M. and Dang, H. and others},
  year = {2015}
}

@article{zhou_marron_2015,
  title = {High dimension low sample size asymptotics of robust {PCA}},
  volume = {9},
  ISSN = {1935-7524},
  DOI = {10.1214/15-ejs992},
  abstractNote = {Conventional principal component analysis is highly susceptible to outliers. In particular, a sufficiently outlying single data point, can draw the leading principal component toward itself. In this paper, we study the effects of outliers for high dimension and low sample size data, using asymptotics.
The non-robust nature of conventional principal component analysis is verified through inconsistency under multivariate Gaussian assumptions with a single spike in the covariance structure, in the presence of a contaminating outlier. In the same setting, the robust method of spherical principal components is consistent with the population eigenvector for the spike model, even in the presence of contamination.},
  number = {1},
  journal = {Electronic Journal of Statistics},
  author = {Zhou, Yi-Hui and Marron, J. S.},
  year = {2015},
  pages = {204--218}
}

% Second record of the paper above under a different key (same title, volume,
% issue and pages). The author list is the complete two-author list shown there.
@article{zhou_marron_others_2015,
  title = {High dimension low sample size asymptotics of robust {PCA}},
  volume = {9},
  number = {1},
  journal = {Electronic Journal of Statistics},
  publisher = {The Institute of Mathematical Statistics and the Bernoulli Society},
  author = {Zhou, Yi-Hui and Marron, J. S.},
  year = {2015},
  pages = {204--218}
}

@article{zhou_wright_2015,
  title = {Hypothesis testing at the extremes: fast and robust association for high-throughput data},
  volume = {16},
  ISSN = {1468-4357},
  DOI = {10.1093/biostatistics/kxv007},
  abstractNote = {A number of biomedical problems require performing many hypothesis tests, with an attendant need to apply stringent thresholds. Often the data take the form of a series of predictor vectors, each of which must be compared with a single response vector, perhaps with nuisance covariates. Parametric tests of association are often used, but can result in inaccurate type I error at the extreme thresholds, even for large sample sizes. Furthermore, standard two-sided testing can reduce power compared with the doubled $p$-value, due to asymmetry in the null distribution. Exact (permutation) testing is attractive, but can be computationally intensive and cumbersome. We present an approximation to exact association tests of trend that is accurate and fast enough for standard use in high-throughput settings, and can easily provide standard two-sided or doubled $p$-values. The approach is shown to be equivalent under permutation to likelihood ratio tests for the most commonly used generalized linear models (GLMs). For linear regression, covariates are handled by working with covariate-residualized responses and predictors. For GLMs, stratified covariates can be handled in a manner similar to exact conditional testing. Simulations and examples illustrate the wide applicability of the approach. The accompanying mcc package is available on CRAN http://cran.r-project.org/web/packages/mcc/index.html.},
  number = {3},
  journal = {Biostatistics},
  publisher = {Oxford University Press},
  author = {Zhou, Yi-Hui and Wright, Fred A.},
  year = {2015},
  month = jul,
  pages = {611--625}
}

@article{rager_tilley_tulenko_smeester_ray_yosim_currier_ishida_gonzález-horta_sánchez-ramírez_et al._2015,
  title = {Identification of Novel Gene Targets and Putative Regulators of Arsenic-Associated {DNA} Methylation in Human Urothelial Cells and Bladder Cancer},
  volume = {28},
  ISSN = {0893-228X 1520-5010},
  url = {http://dx.doi.org/10.1021/TX500393Y},
  DOI = {10.1021/TX500393Y},
  abstractNote = {There is strong epidemiologic evidence linking chronic exposure to inorganic arsenic (iAs) to myriad adverse health effects, including cancer of the bladder. We set out to identify DNA methylation patterns associated with arsenic and its metabolites in exfoliated urothelial cells (EUCs) that originate primarily from the urinary bladder, one of the targets of arsenic-induced carcinogenesis. Genome-wide, gene-specific promoter DNA methylation levels were assessed in EUCs from 46 residents of Chihuahua, Mexico, and the relationship was examined between promoter methylation profiles and the intracellular concentrations of total arsenic and arsenic species. A set of 49 differentially methylated genes was identified with increased promoter methylation associated with EUC tAs, iAs, and/or monomethylated As (MMAs) enriched for their roles in metabolic disease and cancer.
Notably, no genes had differential methylation associated with EUC dimethylated As (DMAs), suggesting that DMAs may influence DNA methylation-mediated urothelial cell responses to a lesser extent than iAs or MMAs. Further analysis showed that 22 of the 49 arsenic-associated genes (45%) are also differentially methylated in bladder cancer tissue identified using The Cancer Genome Atlas repository. Both the arsenic- and cancer-associated genes are enriched for the binding sites of common transcription factors known to play roles in carcinogenesis, demonstrating a novel potential mechanistic link between iAs exposure and bladder cancer.},
  number = {6},
  journal = {Chemical Research in Toxicology},
  publisher = {American Chemical Society (ACS)},
  author = {Rager, Julia E. and Tilley, Sloane K. and Tulenko, Samantha E. and Smeester, Lisa and Ray, Paul D. and Yosim, Andrew and Currier, Jenna M. and Ishida, María C. and González-Horta, Maria del Carmen and Sánchez-Ramírez, Blanca and others},
  year = {2015},
  month = jun,
  pages = {1144--1155}
}

% ASCII-key record of the same article (same journal, volume, issue and pages);
% kept under its own key so citations of that key still resolve.
@article{rager_tilley_tulenko_smeester_ray_yosim_currier_ishida_gonzalez-horta_sanchez-ramirez_et al._2015,
  title = {Identification of novel gene targets and putative regulators of arsenic-associated {DNA} methylation in human urothelial cells and bladder cancer},
  volume = {28},
  number = {6},
  journal = {Chemical Research in Toxicology},
  author = {Rager, J. E. and Tilley, S. K. and Tulenko, S. E. and Smeester, L. and Ray, P. D. and Yosim, A. and Currier, J. M. and Ishida, M. C. and Gonzalez-Horta, M. D. and Sanchez-Ramirez, B. and others},
  year = {2015},
  pages = {1144--1155}
}

% The export stored this record's title only in its key. Title and authors are
% taken from the two records above, which share its journal, volume, issue,
% year and page range.
@article{identification of novel gene targets and putative regulators of arsenic-associated dna methylation in human urothelial cells and bladder cancer_2015,
  title = {Identification of novel gene targets and putative regulators of arsenic-associated {DNA} methylation in human urothelial cells and bladder cancer},
  volume = {28},
  number = {6},
  journal = {Chemical Research in Toxicology},
  publisher = {American Chemical Society},
  author = {Rager, Julia E. and Tilley, Sloane K. and Tulenko, Samantha E. and Smeester, Lisa and Ray, Paul D. and Yosim, Andrew and Currier, Jenna M. and Ishida, María C. and González-Horta, Maria del Carmen and Sánchez-Ramírez, Blanca and others},
  year = {2015},
  pages = {1144--1155}
}

% Title recovered from the citation key. The export carries no author list for
% this record, so the required author field is still missing; fill it in from
% the publisher's record.
@article{metabolomic evaluation of neutrophilic airway inflammation in cystic fibrosis_2015,
  title = {Metabolomic evaluation of neutrophilic airway inflammation in cystic fibrosis},
  volume = {148},
  number = {2},
  journal = {Chest},
  publisher = {Elsevier},
  year = {2015},
  pages = {507--515}
}

@article{zhou_2016,
  title = {Pathway Analysis for {RNA-Seq} Data Using a Score-Based Approach},
  volume = {72},
  ISSN = {1541-0420},
  DOI = {10.1111/biom.12372},
  abstractNote = {A variety of pathway/gene-set approaches have been proposed to provide evidence of higher-level biological phenomena in the association of expression with experimental condition or clinical outcome. Among these approaches, it has been repeatedly shown that resampling methods are far preferable to approaches that implicitly assume independence of genes. However, few approaches have been optimized for the specific characteristics of RNA-Seq transcription data, in which mapped tags produce discrete counts with varying library sizes, and with potential outliers or skewness patterns that violate parametric assumptions. We describe transformations to RNA-Seq data to improve power for linear associations with outcome and flexibly handle normalization factors. Using these transformations or alternate transformations, we apply recently developed null approximations to quadratic form statistics for both self-contained and competitive pathway testing. The approach provides a convenient integrated platform for RNA-Seq pathway testing. We demonstrate that the approach provides appropriate type I error control without actual permutation and is powerful under many settings in comparison to competing approaches. Pathway analysis of data from a study of F344 vs.
HIV1Tg rats, and of sex differences in lymphoblastoid cell lines from humans, strongly supports the biological interpretability of the findings.},
  number = {1},
  journal = {Biometrics},
  author = {Zhou, Yi-Hui},
  year = {2016},
  month = mar,
  pages = {165--174}
}

% NOTE(review): journal, volume, number and pages below look like scraping
% residue rather than real bibliographic data; confirm against the source
% before citing this record.
@article{zhou_2015,
  title = {{PathwaySeq}: Pathway analysis for {RNA-Seq} data using a score-based approach},
  volume = {1},
  number = {1000},
  journal = {dim (data. test)},
  author = {Zhou, Yihui},
  year = {2015},
  pages = {448}
}

% One record per key: BibTeX rejects repeated keys, so the publisher that the
% export listed in a second copy of this key is carried here.
@article{abdo_xia_brown_kosyk_huang_sakamuru_zhou_jack_gallins_xia_et al._2015,
  title = {Population-Based in Vitro Hazard and Concentration-Response Assessment of Chemicals: The {1000 Genomes} High-Throughput Screening Study},
  volume = {123},
  ISSN = {1552-9924},
  DOI = {10.1289/ehp.1408775},
  abstractNote = {Background: Understanding of human variation in toxicity to environmental chemicals remains limited, so human health risk assessments still largely rely on a generic 10-fold factor ($10^{1/2}$ each for toxicokinetics and toxicodynamics) to account for sensitive individuals or subpopulations. Objectives: We tested a hypothesis that population-wide in vitro cytotoxicity screening can rapidly inform both the magnitude of and molecular causes for interindividual toxicodynamic variability. Methods: We used 1,086 lymphoblastoid cell lines from the 1000 Genomes Project, representing nine populations from five continents, to assess variation in cytotoxic response to 179 chemicals. Analysis included assessments of population variation and heritability, and genome-wide association mapping, with attention to phenotypic relevance to human exposures. Results: For about half the tested compounds, cytotoxic response in the 1% most “sensitive” individual occurred at concentrations within a factor of $10^{1/2}$ (i.e., approximately 3) of that in the median individual; however, for some compounds, this factor was > 10. Genetic mapping suggested important roles for variation in membrane and transmembrane genes, with a number of chemicals showing association with SNP rs13120371 in the solute carrier SLC7A11, previously implicated in chemoresistance. Conclusions: This experimental approach fills critical gaps unaddressed by recent large-scale toxicity testing programs, providing quantitative, experimentally based estimates of human toxicodynamic variability, and also testable hypotheses about mechanisms contributing to interindividual variation. Citation: Abdo N, Xia M, Brown CC, Kosyk O, Huang R, Sakamuru S, Zhou YH, Jack JR, Gallins P, Xia K, Li Y, Chiu WA, Motsinger-Reif AA, Austin CP, Tice RR, Rusyn I, Wright FA. 2015. Population-based in vitro hazard and concentration–response assessment of chemicals: the 1000 Genomes high-throughput screening study. Environ Health Perspect 123:458–466; http://dx.doi.org/10.1289/ehp.1408775},
  number = {5},
  journal = {Environmental Health Perspectives},
  publisher = {National Institute of Environmental Health Science},
  author = {Abdo, Nour and Xia, Menghang and Brown, Chad C. and Kosyk, Oksana and Huang, Ruili and Sakamuru, Srilatha and Zhou, Yi-Hui and Jack, John R. and Gallins, Paul and Xia, Kai and others},
  year = {2015},
  month = may,
  pages = {458--466}
}

@article{ardlie_deluca_segre_sullivan_young_gelfand_trowbridge_maller_tukiainen_lek_et al._2015,
  title = {The {Genotype-Tissue Expression} ({GTEx}) pilot analysis: Multitissue gene regulation in humans},
  volume = {348},
  number = {6235},
  journal = {Science},
  author = {Ardlie, K. G. and DeLuca, D. S. and Segre, A. V. and Sullivan, T. J. and Young, T. R. and Gelfand, E. T. and Trowbridge, C. A. and Maller, J. B. and Tukiainen, T. and Lek, M.
and others},
  year = {2015},
  pages = {648--660}
}

% Key carries a b suffix: zhou_wright_2015 is already taken earlier in this
% file by the Biostatistics 16(3) hypothesis-testing paper, and BibTeX rejects
% a repeated key.
@article{zhou_wright_2015b,
  title = {The projack: a resampling approach to correct for ranking bias in high-throughput studies},
  volume = {17},
  number = {1},
  journal = {Biostatistics},
  publisher = {Oxford University Press},
  author = {Zhou, Yi-Hui and Wright, Fred A},
  year = {2015},
  pages = {54--64}
}

@article{sanders_smeester_rojas_debussycher_wu_wright_zhou_laine_rager_swamy_et al._2014,
  title = {Cadmium exposure and the epigenome: Exposure-associated patterns of {DNA} methylation in leukocytes from mother-baby pairs},
  volume = {9},
  number = {2},
  journal = {Epigenetics},
  publisher = {Taylor \& Francis},
  author = {Sanders, Alison and Smeester, Lisa and Rojas, Daniel and DeBussycher, Tristan and Wu, Michael and Wright, Fred and Zhou, Yi-Hui and Laine, Jessica and Rager, Julia and Swamy, Geeta and others},
  year = {2014},
  pages = {212--221}
}

% One record per key: the export listed this key three times and BibTeX rejects
% repeated keys. This is the most complete copy; the other two held no
% additional data.
@article{wright_sullivan_brooks_zou_sun_xia_madar_jansen_chung_zhou_et al._2014,
  title = {Heritability and genomics of gene expression in peripheral blood},
  volume = {46},
  ISSN = {1061-4036 1546-1718},
  url = {http://dx.doi.org/10.1038/NG.2951},
  DOI = {10.1038/NG.2951},
  abstractNote = {Fred Wright, Patrick Sullivan and colleagues present the results of a large expression QTL study of peripheral blood using a classic twin design with follow-up replication in independent samples. Their results enable a more precise estimate of the heritability of gene expression and provide a useful resource for exploring the genetic control of transcription. We assessed gene expression profiles in 2,752 twins, using a classic twin design to quantify expression heritability and quantitative trait loci (eQTLs) in peripheral blood. The most highly heritable genes (∼777) were grouped into distinct expression clusters, enriched in gene-poor regions, associated with specific gene function or ontology classes, and strongly associated with disease designation. The design enabled a comparison of twin-based heritability to estimates based on dizygotic identity-by-descent sharing and distant genetic relatedness. Consideration of sampling variation suggests that previous heritability estimates have been upwardly biased. Genotyping of 2,494 twins enabled powerful identification of eQTLs, which we further examined in a replication set of 1,895 unrelated subjects. A large number of non-redundant local eQTLs (6,756) met replication criteria, whereas a relatively small number of distant eQTLs (165) met quality control and replication standards. Our results provide a new resource toward understanding the genetic control of transcription.},
  number = {5},
  journal = {Nature Genetics},
  publisher = {Springer Science and Business Media LLC},
  author = {Wright, Fred A and Sullivan, Patrick F and Brooks, Andrew I and Zou, Fei and Sun, Wei and Xia, Kai and Madar, Vered and Jansen, Rick and Chung, Wonil and Zhou, Yi-Hui and others},
  year = {2014},
  month = apr,
  pages = {430--437}
}

@article{chiu_campbell_clewell_zhou_wright_guyton_rusyn_2014, title={Physiologically Based Pharmacokinetic (PBPK) Modeling of Interstrain Variability in Trichloroethylene Metabolism in the Mouse}, volume={122}, ISSN={["1552-9924"]}, DOI={10.1289/ehp.1307623}, abstractNote={Background: Quantitative estimation of toxicokinetic variability in the human population is a persistent challenge in risk assessment of environmental chemicals. Traditionally, interindividual differences in the population are accounted for by default assumptions or, in rare cases, are based on human toxicokinetic data. Objectives: We evaluated the utility of genetically diverse mouse strains for estimating toxicokinetic population variability for risk assessment, using trichloroethylene (TCE) metabolism as a case study. Methods: We used data on oxidative and glutathione conjugation metabolism of TCE in 16 inbred and 1 hybrid mouse strains to calibrate and extend existing physiologically based pharmacokinetic (PBPK) models. We added one-compartment models for glutathione metabolites and a two-compartment model for dichloroacetic acid (DCA). We used a Bayesian population analysis of interstrain variability to quantify variability in TCE metabolism. Results: Concentration–time profiles for TCE metabolism to oxidative and glutathione conjugation metabolites varied across strains. 
Median predictions for the metabolic flux through oxidation were less variable (5-fold range) than that through glutathione conjugation (10-fold range). For oxidative metabolites, median predictions of trichloroacetic acid production were less variable (2-fold range) than DCA production (5-fold range), although the uncertainty bounds for DCA exceeded the predicted variability. Conclusions: Population PBPK modeling of genetically diverse mouse strains can provide useful quantitative estimates of toxicokinetic population variability. When extrapolated to lower doses more relevant to environmental exposures, mouse population-derived variability estimates for TCE metabolism closely matched population variability estimates previously derived from human toxicokinetic studies with TCE, highlighting the utility of mouse interstrain metabolism studies for addressing toxicokinetic variability. Citation: Chiu WA, Campbell JL Jr, Clewell HJ III, Zhou YH, Wright FA, Guyton KZ, Rusyn I. 2014. Physiologically based pharmacokinetic (PBPK) modeling of interstrain variability in trichloroethylene metabolism in the mouse. Environ Health Perspect 122:456–463; http://dx.doi.org/10.1289/ehp.1307623}, number={5}, journal={ENVIRONMENTAL HEALTH PERSPECTIVES}, author={Chiu, Weihsueh A. and Campbell, Jerry L., Jr. and Clewell, Harvey J., III and Zhou, Yi-Hui and Wright, Fred A. and Guyton, Kathryn Z. 
and Rusyn, Ivan}, year={2014}, month={May}, pages={456–463} } @article{physiologically based pharmacokinetic (pbpk) modeling of interstrain variability in trichloroethylene metabolism in the mouse_2014, volume={122}, number={5}, journal={Environmental health perspectives}, publisher={National Institute of Environmental Health Science}, year={2014}, pages={456} } @article{zhou_dang_2014, title={RNA-Seq pathway analysis}, author={Zhou, Yi-Hui and Dang, Hong}, year={2014} } @article{zhou_barry_wright_2013, title={Empirical pathway analysis, without permutation}, number={doi:10.1093/biostatistics/kxt004}, journal={Biostatistics}, author={Zhou, Yi-Hui and Barry, William T. and Wright, Fred A.}, year={2013}, pages={1–13} } @inbook{esther_zhou_wright_boucher_2013, title={Metabolomic Evaluation Of Cystic Fibrosis Airways Disease}, booktitle={A26. PEDIATRIC CYSTIC FIBROSIS}, publisher={American Thoracic Society}, author={Esther, Charles R and Zhou, Yihui and Wright, Fred and Boucher, Richard C}, year={2013}, pages={A1165–A1165} } @article{sun_oey_zhou_2013, title={Skill-assessments of statistical and Ensemble Kalman Filter data assimilative analyses using surface and deep observations in the Gulf of Mexico}, volume={7}, ISSN={2095-0195 2095-0209}, url={http://dx.doi.org/10.1007/S11707-013-0377-8}, DOI={10.1007/S11707-013-0377-8}, number={3}, journal={Frontiers of Earth Science}, publisher={Springer Science and Business Media LLC}, author={Sun, Zhibin and Oey, Lie-Yauw and Zhou, Yi-Hui}, year={2013}, month={Jul}, pages={271–281} } @article{zhou_mayhew_sun_xu_zou_wright_2013, title={Space-time clustering and the permutation moments of quadratic forms}, volume={2}, ISSN={2049-1573}, url={http://dx.doi.org/10.1002/STA4.37}, DOI={10.1002/STA4.37}, abstractNote={AbstractThe Mantel and Knox space–time clustering statistics are popular tools to establish transmissibility of a disease and detect outbreaks. 
The most commonly used null distributional approximations may provide poor fits, and researchers often resort to direct sampling from the permutation distribution. However, the exact first four moments for these statistics are available, and Pearson distributional approximations are often effective. Thus, our first goals are to clarify the literature and make these tools more widely available. In addition, by rewriting terms in the statistics, we obtain the exact first four permutation moments for the most commonly used quadratic form statistics, which need not be positive definite. The extension of this work to quadratic forms greatly expands the utility of density approximations for these problems, including for high‐dimensional applications, where the statistics must be extreme in order to exceed stringent testing thresholds. We demonstrate the methods using examples from the investigation of disease transmission in cattle, the association of a gene expression pathway with breast cancer survival, regional genetic association with cystic fibrosis lung disease and hypothesis testing for smoothed local linear regression. © The Authors. 
Stat published by John Wiley & Sons Ltd.}, number={1}, journal={Stat}, publisher={Wiley}, author={Zhou, Yi-Hui and Mayhew, Gregory and Sun, Zhibin and Xu, Xiaolin and Zou, Fei and Wright, Fred A.}, year={2013}, month={Nov}, pages={292–302} } @article{xu_xu_zhou_2013, title={Stochastic comparisons in a price-quantity setting firm with uncertain demand and emergency procurement}, volume={22}, ISSN={1004-3756 1861-9576}, url={http://dx.doi.org/10.1007/S11518-013-5226-5}, DOI={10.1007/S11518-013-5226-5}, number={4}, journal={Journal of Systems Science and Systems Engineering}, publisher={Springer Science and Business Media LLC}, author={Xu, Minghui and Xu, Xiaolin and Zhou, Yi-hui}, year={2013}, month={Oct}, pages={401–420} } @article{xu_xu_zhou_2013, title={Stochastic comparisons in a price-quantity setting firm with uncertain demand and emergency procurement}, volume={22}, number={4}, journal={Journal of systems science and systems engineering}, publisher={Springer Berlin Heidelberg}, author={Xu, Minghui and Xu, Xiaolin and Zhou, Yi-hui}, year={2013}, pages={401–420} } @article{lock_abdo_huang_xia_kosyk_o’shea_zhou_sedykh_tropsha_austin_et al._2012, title={Quantitative high-throughput screening for chemical toxicity in a population-based in vitro model}, journal={Toxicological Sciences}, publisher={Soc Toxicology}, author={Lock, Eric F and Abdo, Nour and Huang, Ruili and Xia, Menghang and Kosyk, Oksana and O’Shea, Shannon H and Zhou, Yi-Hui and Sedykh, Alexander and Tropsha, Alexander and Austin, Christopher P and et al.}, year={2012} } @article{zhou_wright_2012, title={Simple and accurate trend tests using a permutation approximation}, publisher={bepress}, author={Zhou, Yi-Hui and Wright, Fred A}, year={2012} } @article{li_chen_liu_zhou_2013, title={Single Nucleotide Polymorphism (SNP) Detection and Genotype Calling from Massively Parallel Sequencing (MPS) Data}, volume={5}, ISSN={1867-1764 1867-1772}, url={http://dx.doi.org/10.1007/S12561-012-9067-4}, 
DOI={10.1007/S12561-012-9067-4}, abstractNote={Massively parallel sequencing (MPS), since its debut in 2005, has transformed the field of genomic studies. These new sequencing technologies have resulted in the successful identification of causal variants for several rare Mendelian disorders. They have also begun to deliver on their promise to explain some of the missing heritability from genome-wide association studies (GWAS) of complex traits. We anticipate a rapidly growing number of MPS-based studies for a diverse range of applications in the near future. One crucial and nearly inevitable step is to detect SNPs and call genotypes at the detected polymorphic sites from the sequencing data. Here, we review statistical methods that have been proposed in the past five years for this purpose. In addition, we discuss emerging issues and future directions related to SNP detection and genotype calling from MPS data.}, number={1}, journal={Statistics in Biosciences}, publisher={Springer Science and Business Media LLC}, author={Li, Yun and Chen, Wei and Liu, Eric Yi and Zhou, Yi-Hui}, year={2013}, month={May}, pages={3–25} } @article{li_chen_liu_zhou_2012, title={Single Nucleotide Polymorphism (SNP) Detection and Genotype Calling from Massively Parallel Sequencing (MPS) Data}, journal={Statistics in Biosciences}, publisher={Springer}, author={Li, Yun and Chen, Wei and Liu, Eric Yi and Zhou, Yi-Hui}, year={2012}, pages={1–23} } @article{zhou_barry_wright_hill_2012, title={Supplementary material for: Empirical Pathway Analysis, Without Permutation}, author={Zhou, Yi-Hui and Barry, William T and Wright, Fred A and Hill, Chapel}, year={2012} } @article{zhou_xia_wright_2011, title={A powerful and flexible approach to the analysis of RNA sequence count data}, volume={27}, number={19}, journal={Bioinformatics}, publisher={Oxford University Press}, author={Zhou, Yi-Hui and Xia, Kai and Wright, Fred A}, year={2011}, pages={2672–2678} } 
@article{xia_shabalin_huang_madar_zhou_wang_zou_sun_sullivan_wright_2011,
  author = {Xia, Kai and Shabalin, Andrey A and Huang, Shunping and Madar, Vered and Zhou, Yi-Hui and Wang, Wei and Zou, Fei and Sun, Wei and Sullivan, Patrick F and Wright, Fred A},
  title = {{seeQTL}: a searchable database for human {eQTLs}},
  journal = {Bioinformatics},
  volume = {28},
  number = {3},
  pages = {451--452},
  year = {2011},
  publisher = {Oxford University Press},
}

@article{zhou_gowda_2009,
  author = {Zhou, Yihui and Gowda, M. Seetharama},
  title = {On the finiteness of the cone spectrum of certain linear transformations on {Euclidean} {Jordan} algebras},
  journal = {Linear Algebra and its Applications},
  volume = {431},
  number = {5--7},
  pages = {772--782},
  year = {2009},
  month = aug,
  publisher = {Elsevier BV},
  ISSN = {0024-3795},
  url = {http://dx.doi.org/10.1016/j.laa.2009.03.031},
  DOI = {10.1016/j.laa.2009.03.031},
  abstractNote = {Let L be a linear transformation on a finite dimensional real Hilbert space H and K be a closed convex cone with dual $K^{*}$ in H. The cone spectrum of L relative to K is the set of all real $\lambda$ for which the linear complementarity problem $x \in K$, $y = L(x) - \lambda x \in K^{*}$, and $\langle x, y \rangle = 0$ admits a nonzero solution x. In the setting of a Euclidean Jordan algebra H and the corresponding symmetric cone K, we discuss the finiteness of the cone spectrum for Z-transformations and quadratic representations on H.},
}

@comment{NOTE(review): the text in the next comment was exported as an article
  entry whose key was a run of author-affiliation text and which had no fields,
  so it could not be read as an entry. The text is preserved verbatim below.
  Presumably it was scraped from a title page; TODO confirm it can be deleted.}

@comment{medical center, new york, ny 10032, usa. 18wellcome trust centre for human genetics research, nuffield department of clinical medicine, university of oxford, oxford ox3 7bn, uk. 19department of computer science, stanford university, stanford, ca 94305, usa. 20department of computer science, johns }

@article{yin_zhou_asplund_athanassiadis_wideqvist_qiu_zhu_zhao_bergman,
  author = {Yin, Ge and Zhou, Yihui and Asplund, Lillemor and Athanassiadis, Ioannis and Wideqvist, Ulla and Qiu, Yanling and Zhu, Zhiliang and Zhao, Jianfu and Bergman, {\AA}ke},
  title = {Significant Chlorinated Paraffin Concentrations Together with Traditional and Emerging Pollutants in Wildlife from a {Yangtze River Delta} Area Site},
}

@comment{NOTE(review): the next entry was exported with a key and no fields at
  all. The title below is restored from the words of the key; author, year and
  venue are unknown from this file. TODO complete or remove.}

@article{supplemental material physiologically based pharmacokinetic (pbpk) modeling of interstrain variability in trichloroethylene metabolism in the mouse,
  title = {Supplemental Material: Physiologically Based Pharmacokinetic ({PBPK}) Modeling of Interstrain Variability in Trichloroethylene Metabolism in the Mouse},
}

@article{zhou_wright,
  author = {Zhou, Yi-Hui and Wright, Fred A},
  title = {Supplementary Appendix to: Hypothesis testing at the extremes: fast and robust association for high-throughput data},
}

@comment{NOTE(review): the title of the next entry, on July 8, 2015, looks like a
  download stamp and not a real title. The real title, journal and year cannot
  be established from this file, so the field is left as exported. TODO replace
  with the publication metadata.}

@article{deluca_segrè_sullivan_young_gelfand_trowbridge_maller_tukiainen_lek_ward_et al.,
  author = {DeLuca, David S and Segr{\`e}, Ayellet V and Sullivan, Timothy J and Young, Taylor R and Gelfand, Ellen T and Trowbridge, Casandra A and Maller, Julian B and Tukiainen, Taru and Lek, Monkol and Ward, Lucas D and others},
  title = {on July 8, 2015},
}