% Bibliography database: five journal articles (2020--2023).
%
% Conventions used in this file:
%   - one field per line, brace-delimited values, lowercase field names;
%   - month uses the bare three-letter macros (jan ... dec);
%   - non-ASCII characters are written as LaTeX escapes so the file is
%     usable with classic BibTeX as well as Biber;
%   - words in titles that must keep their capitalisation are braced;
%   - the abstract is stored in the standard "abstract" field.

% Nature Communications 14(1), 2023.
% The citation key previously contained spaces and a period
% ("van den broeck_..._et al._2023"); those were replaced by underscores
% (and the period dropped) so that the key can be parsed and cited.
% The author list is truncated in the source data, hence "and others".
@article{van_den_broeck_bhosale_song_fonseca_de_lima_ashley_zhu_zhu_van_de_cotte_neyt_ortiz_et_al_2023,
  author    = {{Van den Broeck}, Lisa and Bhosale, Dinesh Kiran and
               Song, Kuncheng and {Fonseca de Lima}, C{\'a}ssio Flavio and
               Ashley, Michael and Zhu, Tingting and Zhu, Shanshuo and
               Van De Cotte, Brigitte and Neyt, Pia and Ortiz, Anna C. and
               others},
  title     = {Functional annotation of proteins for signaling network
               inference in non-model species},
  journal   = {Nature Communications},
  publisher = {{Springer Science and Business Media LLC}},
  volume    = {14},
  number    = {1},
  year      = {2023},
  month     = aug,
  issn      = {2041-1723},
  doi       = {10.1038/s41467-023-40365-z},
  url       = {https://doi.org/10.1038/s41467-023-40365-z},
  abstract  = {Molecular biology aims to understand cellular responses and
               regulatory dynamics in complex biological systems. However,
               these studies remain challenging in non-model species due to
               poor functional annotation of regulatory proteins. To
               overcome this limitation, we develop a multi-layer neural
               network that determines protein functionality directly from
               the protein sequence. We annotate kinases and phosphatases
               in Glycine max. We use the functional annotations from our
               neural network, Bayesian inference principles, and high
               resolution phosphoproteomics to infer phosphorylation
               signaling cascades in soybean exposed to cold, and identify
               Glyma.10G173000 (TOI5) and Glyma.19G007300 (TOT3) as key
               temperature regulators. Importantly, the signaling cascade
               inference does not rely upon known kinase motifs or
               interaction data, enabling de novo identification of
               kinase-substrate interactions. Conclusively, our neural
               network shows generalization and scalability, as such we
               extend our predictions to Oryza sativa, Zea mays, Sorghum
               bicolor, and Triticum aestivum. Taken together, we develop a
               signaling inference approach for non-model species
               leveraging our predicted kinases and phosphatases.},
}

% Bioengineering 10(2), 2023.
@article{song_zhou_2023,
  author   = {Song, Kuncheng and Zhou, Yi-Hui},
  title    = {Leveraging Scheme for Cross-Study Microbiome Machine Learning
              Prediction and Feature Evaluations},
  journal  = {Bioengineering},
  volume   = {10},
  number   = {2},
  year     = {2023},
  month    = feb,
  issn     = {2306-5354},
  doi      = {10.3390/bioengineering10020231},
  url      = {https://doi.org/10.3390/bioengineering10020231},
  abstract = {The microbiota has proved to be one of the critical factors
              for many diseases, and researchers have been using microbiome
              data for disease prediction. However, models trained on one
              independent microbiome study may not be easily applicable to
              other independent studies due to the high level of
              variability in microbiome data. In this study, we developed a
              method for improving the generalizability and
              interpretability of machine learning models for predicting
              three different diseases (colorectal cancer, Crohn's disease,
              and immunotherapy response) using nine independent microbiome
              datasets. Our method involves combining a smaller dataset
              with a larger dataset, and we found that using at least 25\%
              of the target samples in the source data resulted in improved
              model performance. We determined random forest as our top
              model and employed feature selection to identify common and
              important taxa for disease prediction across the different
              studies. Our results suggest that this leveraging scheme is a
              promising approach for improving the accuracy and
              interpretability of machine learning models for predicting
              diseases based on microbiome data.},
}

% BMC Bioinformatics 23(1), 2022. The abstract is a structured abstract
% (Background / Results / Conclusion).
@article{song_zhou_2022,
  author   = {Song, Kuncheng and Zhou, Yi-Hui},
  title    = {{C3NA}: correlation and consensus-based cross-taxonomy
              network analysis for compositional microbial data},
  journal  = {BMC Bioinformatics},
  volume   = {23},
  number   = {1},
  year     = {2022},
  month    = nov,
  issn     = {1471-2105},
  doi      = {10.1186/s12859-022-05027-9},
  abstract = {Background: Studying the co-occurrence network structure of
              microbial samples is one of the critical approaches to
              understanding the perplexing and delicate relationship
              between the microbe, host, and diseases. It is also critical
              to develop a tool for investigating co-occurrence networks
              and differential abundance analyses to reveal the
              disease-related taxa--taxa relationship. In addition, it is
              also necessary to tighten the co-occurrence network into
              smaller modules to increase the ability for functional
              annotation and interpretability of these taxa--taxa
              relationships. Also, it is critical to retain the
              phylogenetic relationship among the taxa to identify
              differential abundance patterns, which can be used to resolve
              contradicting functions reported by different studies.
              Results: In this article, we present Correlation and
              Consensus-based Cross-taxonomy Network Analysis (C3NA), a
              user-friendly R package for investigating compositional
              microbial sequencing data to identify and compare
              co-occurrence patterns across different taxonomic levels.
              C3NA contains two interactive graphic user interfaces (Shiny
              applications), one of them dedicated to the comparison
              between two diagnoses, e.g., disease versus control. We used
              C3NA to analyze two well-studied diseases, colorectal cancer,
              and Crohn's disease. We discovered clusters of study and
              disease-dependent taxa that overlap with known functional
              taxa studied by other discovery studies and differential
              abundance analyses.
              Conclusion: C3NA offers a new microbial data analyses
              pipeline for refined and enriched taxa--taxa co-occurrence
              network analyses, and the usability was further expanded via
              the built-in Shiny applications for interactive
              investigation.},
}

% Environmental Health Perspectives 129(1), 2021 (Research Letter).
% Previously typed as misc although it carries journal/volume/number.
% The abstract field used to contain the scraped article web page; it now
% holds only the letter's Introduction, Methods and Discussion text.
% Figure captions, acknowledgments, the reference list and page chrome
% were dropped. The article number in "pages" comes from the
% "CID: 017701" string that was part of that scraped text.
@article{marvel_house_wheeler_song_zhou_wright_chiu_rusyn_motsinger-reif_reif_2021,
  author   = {Marvel, Skylar W. and House, John S. and Wheeler, Matthew and
              Song, Kuncheng and Zhou, Yi-Hui and Wright, Fred A. and
              Chiu, Weihsueh A. and Rusyn, Ivan and
              Motsinger-Reif, Alison and Reif, David M.},
  title    = {The {COVID-19} {Pandemic Vulnerability Index} ({PVI})
              Dashboard: Monitoring County-Level Vulnerability Using
              Visualization, Statistical Modeling, and Machine Learning},
  journal  = {Environmental Health Perspectives},
  volume   = {129},
  number   = {1},
  pages    = {017701},
  year     = {2021},
  month    = jan,
  issn     = {1552-9924},
  doi      = {10.1289/EHP8690},
  url      = {http://www.scopus.com/inward/record.url?eid=2-s2.0-85099420902&partnerID=MN8TOARS},
  abstract = {Introduction: Expert groups have coalesced around a roadmap
              to address the current COVID-19 pandemic centered on social
              distancing, monitoring case counts and health care capacity,
              and, eventually, moving to pharmaceutical interventions.
              However, responsibility for navigating the pandemic response
              falls largely on state and local officials. To make equitable
              decisions on allocating resources, caring for vulnerable
              subpopulations, and implementing local- and state-level
              interventions, access to current pandemic data and key
              vulnerabilities at the community level are essential
              (National Academies of Sciences, Engineering, and Medicine
              2020). Although numerous predictive models and interactive
              monitoring applications have been developed using
              pandemic-related data sets (Wynants et al. 2020), their
              capacity to aid in dynamic, community-level decision-making
              is limited. We developed the interactive COVID-19 Pandemic
              Vulnerability Index (PVI) Dashboard
              (https://covid19pvi.niehs.nih.gov/) to address this need by
              presenting a visual synthesis of dynamic information at the
              county level to monitor disease trajectories, communicate
              local vulnerabilities, forecast key outcomes, and guide
              informed responses (Figure 1).
              Methods: The current PVI model integrates multiple data
              streams into an overall score derived from 12 key
              indicators---including well-established, general
              vulnerability factors for public health, plus emerging
              factors relevant to the pandemic---distributed across four
              domains: current infection rates, baseline population
              concentration, current interventions, and health and
              environmental vulnerabilities. The PVI profiles translate
              numerical results into visual representations, with each
              vulnerability factor represented as a component slice of a
              radar chart (Figure 2). The PVI profile for each county is
              calculated using the Toxicological Prioritization Index
              (ToxPi) framework for data integration within a geospatial
              context (Marvel et al. 2018; Bhandari et al. 2020). Data
              sources in the current model (version 11.2.1) include the
              Social Vulnerability Index (SVI) of the Centers for Disease
              Control and Prevention (CDC) for emergency response and
              hazard mitigation planning (Horney et al. 2017), testing
              rates from the COVID Tracking Project (Atlantic Monthly Group
              2020), social distancing metrics from mobile device data
              (https://www.unacast.com/covid19/social-distancing-scoreboard),
              and dynamic measures of disease spread and case numbers
              (https://usafacts.org/issues/coronavirus/). Methodological
              details concerning the integration of data streams---plus the
              complete, daily time series of all source data since February
              2020 and resultant PVI scores---are maintained on the public
              Github project page (COVID19PVI 2020). Over this period, the
              PVI has been strongly associated with key
              vulnerability-related outcome metrics (by rank-correlation),
              with updates of its performance assessment posted with model
              updates alongside data at the Github project page (COVID19PVI
              2020). In addition to the PVI itself---which is a summary,
              human-centric visualization of relative vulnerability
              drivers---the dashboard is supported by rigorous statistical
              modeling of the underlying data to enable quantitative
              analysis and provide short-term, local predictions of cases
              and deaths [complete methodological details are maintained at
              the Github project page (COVID19PVI 2020)]. Generalized
              linear models of cumulative outcome data indicated that,
              after population size, the most significant predictors were
              the proportion of Black residents, mean fine particulate
              matter [particulate matter $\leq$2.5~$\mu$m in diameter
              (PM2.5)], percentage of population with insurance coverage
              (which was positively associated), and proportion of Hispanic
              residents. The local predictions of cases and deaths (see the
              ``Predictions'' panel in Figure 1) are updated daily using a
              Bayesian spatiotemporal random-effects model to build
              forecasts up to 2 weeks out.
              Discussion: The PVI Dashboard supports decision-making and
              dynamic monitoring in several ways. The display can be
              tailored to add or remove layers of information, filtered by
              region (e.g., all counties within a state) or clustered by
              profile shape similarity. The timelines for both PVI models
              and observed COVID-19 outcomes facilitate tracking the impact
              of interventions and directing local resource allocations.
              The ``Predictions'' panel (Figure 1) connects these
              historical numbers to local forecasts of cases and deaths. By
              communicating an integrated concept of vulnerability that
              considers both dynamic (infection rate and interventions) and
              static (community population and health care characteristics)
              drivers, the interactive dashboard can promote buy-in from
              diverse audiences, which is necessary for effective public
              health interventions. This messaging can assist in addressing
              known racial disparities in COVID-19 case and death rates
              (Tan et al. 2020) or populations, and the PVI Dashboard is
              part of the ``Unique Populations'' tab of the CDC's COVID-19
              Data Tracker (https://covid.cdc.gov/covid-data-tracker). By
              filtering the display to highlight vulnerability drivers
              within an overall score context, the dashboard can inform
              targeted interventions for specific localities.
              Unfortunately, the pandemic endures across the United States,
              with broad disparities based on the local environment (Tan et
              al. 2020). We present the PVI Dashboard as a dynamic
              container for contextualizing these disparities. It is a
              modular tool that will evolve to incorporate new data sources
              and analytics as they emerge (e.g., concurrent flu
              infections, school and business reopening statistics,
              heterogeneous public health practices). This flexibility
              positions it well as a resource for integrated prioritization
              of eventual vaccine distribution and monitoring its local
              impact. The PVI Dashboard can empower local and state
              officials to take informed action to combat the pandemic by
              communicating interactive, visual profiles of vulnerability
              atop an underlying statistical framework that enables the
              comparison of counties and the evaluation of the PVI's
              component data.},
}

% Frontiers in Molecular Biosciences 7, 2020 (no issue number in the data).
@article{song_wright_zhou_2020,
  author   = {Song, Kuncheng and Wright, Fred A. and Zhou, Yi-Hui},
  title    = {Systematic Comparisons for Composition Profiles, Taxonomic
              Levels, and Machine Learning Methods for Microbiome-Based
              Disease Prediction},
  journal  = {Frontiers in Molecular Biosciences},
  volume   = {7},
  year     = {2020},
  month    = dec,
  issn     = {2296-889X},
  doi      = {10.3389/fmolb.2020.610845},
  abstract = {Microbiome composition profiles generated from 16S rRNA
              sequencing have been extensively studied for their usefulness
              in phenotype trait prediction, including for complex diseases
              such as diabetes and obesity. These microbiome compositions
              have typically been quantified in the form of Operational
              Taxonomic Unit (OTU) count matrices. However, alternate
              approaches such as Amplicon Sequence Variants (ASV) have been
              used, as well as the direct use of k-mer sequence counts. The
              overall effect of these different types of predictors when
              used in concert with various machine learning methods has
              been difficult to assess, due to varied combinations
              described in the literature. Here we provide an in-depth
              investigation of more than 1,000 combinations of these three
              clustering/counting methods, in combination with varied
              choices for normalization and filtering, grouping at various
              taxonomic levels, and the use of more than ten commonly used
              machine learning methods for phenotype prediction. The use of
              short k-mers, which have computational advantages and
              conceptual simplicity, is shown to be effective as a source
              for microbiome-based prediction. Among machine-learning
              approaches, tree-based methods show consistent, though
              modest, advantages in prediction accuracy. We describe the
              various advantages and disadvantages of combinations in
              analysis approaches, and provide general observations to
              serve as a useful guide for future trait-prediction
              explorations using microbiome data.},
}