@article{davis_wiegers_sciaky_barkalow_strong_wyatt_wiegers_mcmorran_abrar_mattingly_2024, title={Comparative toxicogenomics database's 20th anniversary: update 2025}, volume={10}, ISSN={["1362-4962"]}, DOI={10.1093/nar/gkae883}, abstractNote={For 20 years, the Comparative Toxicogenomics Database (CTD; https://ctdbase.org) has provided high-quality, literature-based curated content describing how environmental chemicals affect human health. Today, CTD includes over 94 million toxicogenomic connections relating chemicals, genes/proteins, phenotypes, anatomical terms, diseases, comparative species, pathways and exposures. In this 20th year anniversary update, we reflect on CTD's remarkable growth and provide an overview of the increased data content and new features, including enhancements to the curation workflow (e.g. new exposure curation tool and expanded use of natural language processing), added functionality (e.g. improvements to CTD Tetramers and Pathway View tools) and significant upgrades to software and infrastructure. Linking lab-based core curation with real-world human exposure curation via the use of controlled vocabularies facilitates analysis of content across the entire environmental health continuum, from molecular toxicological mechanisms to the population level, and vice versa. The 'prototype database' originally described in 2004 has evolved into a premier, sophisticated, highly cited and well-engineered knowledgebase and discoverybase that is utilized by scientists worldwide to design testable hypotheses about environmental health.}, journal={NUCLEIC ACIDS RESEARCH}, author={Davis, Allan Peter and Wiegers, Thomas C. 
and Sciaky, Daniela and Barkalow, Fern and Strong, Melissa and Wyatt, Brent and Wiegers, Jolene and McMorran, Roy and Abrar, Sakib and Mattingly, Carolyn J.}, year={2024}, month={Oct} } @article{wyatt_davis_wiegers_wiegers_abrar_sciaky_barkalow_strong_mattingly_2024, title={Transforming environmental health datasets from the comparative toxicogenomics database into chord diagrams to visualize molecular mechanisms}, volume={6}, ISSN={["2673-3080"]}, DOI={10.3389/ftox.2024.1437884}, abstractNote={In environmental health, the specific molecular mechanisms connecting a chemical exposure to an adverse endpoint are often unknown, reflecting knowledge gaps. At the public Comparative Toxicogenomics Database (CTD; https://ctdbase.org/), we integrate manually curated, literature-based interactions from CTD to compute four-unit blocks of information organized as a potential step-wise molecular mechanism, known as "CGPD-tetramers," wherein a chemical interacts with a gene product to trigger a phenotype which can be linked to a disease. These computationally derived datasets can be used to fill the gaps and offer testable mechanistic information. Users can generate CGPD-tetramers for any combination of chemical, gene, phenotype, and/or disease of interest at CTD; however, such queries typically result in the generation of thousands of CGPD-tetramers. Here, we describe a novel approach to transform these large datasets into user-friendly chord diagrams using R. This visualization process is straightforward, simple to implement, and accessible to inexperienced users that have never used R before. Combining CGPD-tetramers into a single chord diagram helps identify potential key chemicals, genes, phenotypes, and diseases. This visualization allows users to more readily analyze computational datasets that can fill the exposure knowledge gaps in the environmental health continuum.}, journal={FRONTIERS IN TOXICOLOGY}, author={Wyatt, Brent and Davis, Allan Peter and Wiegers, Thomas C. 
and Wiegers, Jolene and Abrar, Sakib and Sciaky, Daniela and Barkalow, Fern and Strong, Melissa and Mattingly, Carolyn J.}, year={2024}, month={Jul} } @article{davis_wiegers_wiegers_wyatt_johnson_sciaky_barkalow_strong_planchart_mattingly_2023, title={CTD tetramers: a new online tool that computationally links curated chemicals, genes, phenotypes, and diseases to inform molecular mechanisms for environmental health}, volume={195}, ISSN={["1096-0929"]}, DOI={10.1093/toxsci/kfad069}, abstractNote={Abstract The molecular mechanisms connecting environmental exposures to adverse endpoints are often unknown, reflecting knowledge gaps. At the Comparative Toxicogenomics Database (CTD), we developed a bioinformatics approach that integrates manually curated, literature-based interactions from CTD to generate a “CGPD-tetramer”: a 4-unit block of information organized as a step-wise molecular mechanism linking an initiating Chemical, an interacting Gene, a Phenotype, and a Disease outcome. Here, we describe a novel, user-friendly tool called CTD Tetramers that generates these evidence-based CGPD-tetramers for any curated chemical, gene, phenotype, or disease of interest. Tetramers offer potential solutions for the unknown underlying mechanisms and intermediary phenotypes connecting a chemical exposure to a disease. Additionally, multiple tetramers can be assembled to construct detailed modes-of-action for chemical-induced disease pathways. As well, tetramers can help inform environmental influences on adverse outcome pathways (AOPs). We demonstrate the tool’s utility with relevant use cases for a variety of environmental chemicals (eg, perfluoroalkyl substances, bisphenol A), phenotypes (eg, apoptosis, spermatogenesis, inflammatory response), and diseases (eg, asthma, obesity, male infertility). 
Finally, we map AOP adverse outcome terms to corresponding CTD terms, allowing users to query for tetramers that can help augment AOP pathways with additional stressors, genes, and phenotypes, as well as formulate potential AOP disease networks (eg, liver cirrhosis and prostate cancer). This novel tool, as part of the complete suite of tools offered at CTD, provides users with computational datasets and their supporting evidence to potentially fill exposure knowledge gaps and develop testable hypotheses about environmental health.}, number={2}, journal={TOXICOLOGICAL SCIENCES}, author={Davis, Allan Peter and Wiegers, Thomas C. and Wiegers, Jolene and Wyatt, Brent and Johnson, Robin J. and Sciaky, Daniela and Barkalow, Fern and Strong, Melissa and Planchart, Antonio and Mattingly, Carolyn J.}, year={2023}, month=sep, pages={155--168} } @article{davis_wiegers_johnson_sciaky_wiegers_mattingly_2022, title={Comparative Toxicogenomics Database (CTD): update 2023}, volume={9}, ISSN={1362-4962}, DOI={10.1093/nar/gkac833}, abstractNote={The Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) harmonizes cross-species heterogeneous data for chemical exposures and their biological repercussions by manually curating and interrelating chemical, gene, phenotype, anatomy, disease, taxa, and exposure content from the published literature. This curated information is integrated to generate inferences, providing potential molecular mediators to develop testable hypotheses and fill in knowledge gaps for environmental health. This dual nature, acting as both a knowledgebase and a discoverybase, makes CTD a unique resource for the scientific community. Here, we report a 20% increase in overall CTD content for 17 100 chemicals, 54 300 genes, 6100 phenotypes, 7270 diseases and 202 000 exposure statements.
We also present CTD Tetramers, a novel tool that computationally generates four-unit information blocks connecting a chemical, gene, phenotype, and disease to construct potential molecular mechanistic pathways. Finally, we integrate terms for human biological media used in the CTD Exposure module to corresponding CTD Anatomy pages, allowing users to survey the chemical profiles for any tissue-of-interest and see how these environmental biomarkers are related to phenotypes for any anatomical site. These, and other webpage visual enhancements, continue to promote CTD as a practical, user-friendly, and innovative resource for finding information and generating testable hypotheses about environmental health.}, journal={NUCLEIC ACIDS RESEARCH}, author={Davis, Allan Peter and Wiegers, Thomas C. and Johnson, Robin J. and Sciaky, Daniela and Wiegers, Jolene and Mattingly, Carolyn J.}, year={2022}, month={Sep} } @article{davis_wiegers_wiegers_grondin_johnson_sciaky_mattingly_2021, title={CTD anatomy: Analyzing chemical-induced phenotypes and exposures from an anatomical perspective, with implications for environmental health studies}, volume={2}, ISSN={["2666-027X"]}, DOI={10.1016/j.crtox.2021.03.001}, abstractNote={The Comparative Toxicogenomics Database (CTD) is a freely available public resource that curates and interrelates chemical, gene/protein, phenotype, disease, organism, and exposure data. CTD can be used to address toxicological mechanisms for environmental chemicals and facilitate the generation of testable hypotheses about how exposures affect human health. At CTD, manually curated interactions for chemical-induced phenotypes are enhanced with anatomy terms (tissues, fluids, and cell types) to describe the physiological system of the reported event. These same anatomy terms are used to annotate the human media (e.g., urine, hair, nail, blood, etc.) in which an environmental chemical was assayed for exposure. 
Currently, CTD uses more than 880 unique anatomy terms to contextualize over 255,000 chemical-phenotype interactions and 167,000 exposure statements. These annotations allow chemical-phenotype interactions and exposure data to be explored from a novel, anatomical perspective. Here, we describe CTD's anatomy curation process (including the construction of a controlled, interoperable vocabulary) and new anatomy webpages (that coalesce and organize the curated chemical-phenotype and exposure data sets). We also provide examples that demonstrate how this feature can be used to identify system- and cell-specific chemical-induced toxicities, help inform exposure data, prioritize phenotypes for environmental diseases, survey tissue and pregnancy exposomes, and facilitate data connections with external resources. Anatomy annotations advance understanding of environmental health by providing new ways to explore and survey chemical-induced events and exposure studies in the CTD framework.}, journal={CURRENT RESEARCH IN TOXICOLOGY}, author={Davis, Allan Peter and Wiegers, Thomas C. and Wiegers, Jolene and Grondin, Cynthia J. and Johnson, Robin J. and Sciaky, Daniela and Mattingly, Carolyn J.}, year={2021}, pages={128--139} } @article{grondin_davis_wiegers_wiegers_sciaky_johnson_mattingly_2021, title={Predicting molecular mechanisms, pathways, and health outcomes induced by Juul e-cigarette aerosol chemicals using the Comparative Toxicogenomics Database}, volume={2}, ISSN={2666-027X}, DOI={10.1016/j.crtox.2021.08.001}, abstractNote={There is a critical need to understand the health risks associated with vaping e-cigarettes, which has reached epidemic levels among teens. Juul is currently the most popular type of e-cigarette on the market.
Using the Comparative Toxicogenomics Database (CTD; http://ctdbase.org), a public resource that integrates chemical, gene, phenotype and disease data, we aimed to analyze the potential molecular mechanisms of eight chemicals detected in the aerosols generated by heating Juul e-cigarette pods: nicotine, acetaldehyde, formaldehyde, free radicals, crotonaldehyde, acetone, pyruvaldehyde, and particulate matter. Curated content in CTD, including chemical-gene, chemical-phenotype, and chemical-disease interactions, as well as associated phenotypes and pathway enrichment, were analyzed to help identify potential molecular mechanisms and diseases associated with vaping. Nicotine shows the most direct disease associations of these chemicals, followed by particulate matter and formaldehyde. Together, these chemicals show a direct marker or mechanistic relationship with 400 unique diseases in CTD, particularly in the categories of cardiovascular diseases, nervous system diseases, respiratory tract diseases, cancers, and mental disorders. We chose three respiratory tract diseases to investigate further, and found that in addition to cellular processes of apoptosis and cell proliferation, prioritized phenotypes underlying Juul-associated respiratory tract disease outcomes include response to oxidative stress, inflammatory response, and several cell signaling pathways (p38MAPK, NIK/NFkappaB, calcium-mediated).}, journal={CURRENT RESEARCH IN TOXICOLOGY}, author={Grondin, Cynthia J. and Davis, Allan Peter and Wiegers, Jolene A. and Wiegers, Thomas C. and Sciaky, Daniela and Johnson, Robin J. 
and Mattingly, Carolyn J.}, year={2021}, pages={272--281} } @article{pinkhasova_jameson_conrow_simeone_davis_wiegers_mattingly_leung_2021, title={Regulatory status of pesticide residues in cannabis: Implications to medical use in neurological diseases}, volume={2}, ISSN={2666-027X}, DOI={10.1016/j.crtox.2021.02.007}, abstractNote={Medical cannabis represents a potential route of pesticide exposure to susceptible populations. We compared the qualifying conditions for medical use and pesticide testing requirements of cannabis in 33 states and Washington, D.C. Movement disorders were the most common neurological category of qualifying conditions, including epilepsy, certain symptoms of multiple sclerosis, Parkinson's Disease, and any cause of symptoms leading to seizures or spasticity. Different approaches of pesticide regulation were implemented in cannabis and cannabis-derived products. Six states imposed the strictest U.S. EPA tolerances (i.e. maximum residue levels) for food commodities on up to 400 pesticidal active ingredients in cannabis, while pesticide testing was optional in three states. Dimethomorph showed the largest variation in action levels, ranging from 0.1 to 60 ppm in 5 states. We evaluated the potential connections between insecticides, cannabinoids, and seizure using the Comparative Toxicogenomics Database. Twenty-two insecticides, two cannabinoids, and 63 genes were associated with 674 computationally generated chemical-gene-phenotype-disease (CGPD) tetramer constructs. Notable functional clusters included oxidation-reduction process (183 CGPD-tetramers), synaptic signaling pathways (151), and neuropeptide hormone activity (46). Cholinergic, dopaminergic, and retrograde endocannabinoid signaling pathways were linked to 10 genetic variants of epilepsy patients.
Further research is needed to assess human health risk of cannabinoids and pesticides in support of a national standard for cannabis pesticide regulations.}, journal={CURRENT RESEARCH IN TOXICOLOGY}, author={Pinkhasova, Dorina V. and Jameson, Laura E. and Conrow, Kendra D. and Simeone, Michael P. and Davis, Allan Peter and Wiegers, Thomas C. and Mattingly, Carolyn J. and Leung, Maxwell C. K.}, year={2021}, pages={140--148} } @article{davis_grondin_johnson_sciaky_wiegers_wiegers_mattingly_2021, title={Comparative Toxicogenomics Database (CTD): update 2021}, volume={49}, ISSN={1362-4962}, DOI={10.1093/nar/gkaa891}, abstractNote={The public Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) is an innovative digital ecosystem that relates toxicological information for chemicals, genes, phenotypes, diseases, and exposures to advance understanding about human health. Literature-based, manually curated interactions are integrated to create a knowledgebase that harmonizes cross-species heterogeneous data for chemical exposures and their biological repercussions. In this biennial update, we report a 20% increase in CTD curated content and now provide 45 million toxicogenomic relationships for over 16 300 chemicals, 51 300 genes, 5500 phenotypes, 7200 diseases and 163 000 exposure events, from 600 comparative species. Furthermore, we increase the functionality of chemical–phenotype content with new data-tabs on CTD Disease pages (to help fill in knowledge gaps for environmental health) and new phenotype search parameters (for Batch Query and Venn analysis tools). As well, we introduce new CTD Anatomy pages that allow users to uniquely explore and analyze chemical–phenotype interactions from an anatomical perspective. Finally, we have enhanced CTD Chemical pages with new literature-based chemical synonyms (to improve querying) and added 1600 amino acid-based compounds (to increase chemical landscape).
Together, these updates continue to augment CTD as a powerful resource for generating testable hypotheses about the etiologies and molecular mechanisms underlying environmentally influenced diseases.}, number={D1}, journal={NUCLEIC ACIDS RESEARCH}, author={Davis, Allan Peter and Grondin, Cynthia J. and Johnson, Robin J. and Sciaky, Daniela and Wiegers, Jolene and Wiegers, Thomas C. and Mattingly, Carolyn J.}, year={2021}, month=jan, pages={D1138--D1143} } @article{davis_wiegers_grondin_johnson_sciaky_wiegers_mattingly_2020, title={Leveraging the Comparative Toxicogenomics Database to Fill in Knowledge Gaps for Environmental Health: A Test Case for Air Pollution-induced Cardiovascular Disease}, volume={177}, ISSN={1096-0929}, DOI={10.1093/toxsci/kfaa113}, abstractNote={Environmental health studies relate how exposures (eg, chemicals) affect human health and disease; however, in most cases, the molecular and biological mechanisms connecting an exposure with a disease remain unknown. To help fill in these knowledge gaps, we sought to leverage content from the public Comparative Toxicogenomics Database (CTD) to identify potential intermediary steps. In a proof-of-concept study, we systematically compute the genes, molecular mechanisms, and biological events for the environmental health association linking air pollution toxicants with 2 cardiovascular diseases (myocardial infarction and hypertension) as a test case. Our approach integrates 5 types of curated interactions in CTD to build sets of “CGPD-tetramers,” computationally constructed information blocks relating a Chemical-Gene interaction with a Phenotype and Disease. This bioinformatics strategy generates 653 CGPD-tetramers for air pollution-associated myocardial infarction (involving 5 pollutants, 58 genes, and 117 phenotypes) and 701 CGPD-tetramers for air pollution-associated hypertension (involving 3 pollutants, 96 genes, and 142 phenotypes).
Collectively, we identify 19 genes and 96 phenotypes shared between these 2 air pollutant-induced outcomes, and suggest important roles for oxidative stress, inflammation, immune responses, cell death, and circulatory system processes. Moreover, CGPD-tetramers can be assembled into extensive chemical-induced disease pathways involving multiple gene products and sequential biological events, and many of these computed intermediary steps are validated in the literature. Our method does not require a priori knowledge of the toxicant, interacting gene, or biological system, and can be used to analyze any environmental chemical-induced disease curated within the public CTD framework. This bioinformatics strategy links and interrelates chemicals, genes, phenotypes, and diseases to fill in knowledge gaps for environmental health studies, as demonstrated for air pollution-associated cardiovascular disease, but can be adapted by researchers for any environmentally influenced disease-of-interest.}, number={2}, journal={TOXICOLOGICAL SCIENCES}, author={Davis, Allan Peter and Wiegers, Thomas C. and Grondin, Cynthia J. and Johnson, Robin J. and Sciaky, Daniela and Wiegers, Jolene and Mattingly, Carolyn J.}, year={2020}, month=oct, pages={392--404} } @article{grondin_davis_wiegers_wiegers_mattingly_2018, title={Accessing an Expanded Exposure Science Module at the Comparative Toxicogenomics Database}, volume={126}, ISSN={1552-9924}, DOI={10.1289/ehp2873}, abstractNote={Summary: The Comparative Toxicogenomics Database (CTD; http://ctdbase.org) is a free resource that provides manually curated information on chemical, gene, phenotype, and disease relationships to advance understanding of the effect of environmental exposures on human health.
Four core content areas are independently curated: chemical–gene interactions, chemical–disease and gene–disease associations, chemical–phenotype interactions, and environmental exposure data (e.g., effects of chemical stressors on humans). Since releasing exposure data in 2015, we have vastly increased our coverage of chemicals and disease/phenotype outcomes; greatly expanded access to exposure content; added search capability by stressors, cohorts, population demographics, and measured outcomes; and created user-specified displays of content. These enhancements aim to facilitate human studies by allowing comparisons among experimental parameters and across studies involving specified chemicals, populations, or outcomes. Integration of data among CTD’s four content areas and external data sets, such as Gene Ontology annotations and pathway information, links exposure data with over 1.8 million chemical–gene, chemical–disease and gene–disease interactions. Our analysis tools reveal direct and inferred relationships among the data and provide opportunities to generate predictive connections between environmental exposures and population-level health outcomes. https://doi.org/10.1289/EHP2873}, number={1}, journal={ENVIRONMENTAL HEALTH PERSPECTIVES}, author={Grondin, Cynthia J. and Davis, Allan Peter and Wiegers, Thomas C. and Wiegers, Jolene A. and Mattingly, Carolyn J.}, year={2018}, month={Jan} } @article{davis_wiegers_wiegers_johnson_sciaky_grondin_mattingly_2018, title={Chemical-Induced Phenotypes at CTD Help Inform the Predisease State and Construct Adverse Outcome Pathways}, volume={165}, ISSN={["1096-0929"]}, DOI={10.1093/toxsci/kfy131}, abstractNote={The Comparative Toxicogenomics Database (CTD; http://ctdbase.org) is a public resource that manually curates the scientific literature to provide content that illuminates the molecular mechanisms by which environmental exposures affect human health. 
We introduce our new chemical-phenotype module that describes how chemicals can affect molecular, cellular, and physiological phenotypes. At CTD, we operationally distinguish between phenotypes and diseases, wherein a phenotype refers to a nondisease biological event: eg, decreased cell cycle arrest (phenotype) versus liver cancer (disease), increased fat cell proliferation (phenotype) versus morbid obesity (disease), etc. Chemical-phenotype interactions are expressed in a formal structured notation using controlled terms for chemicals, phenotypes, taxon, and anatomical descriptors. Combining this information with CTD's chemical-disease module allows inferences to be made between phenotypes and diseases, yielding potential insight into the predisease state. Integration of all 4 CTD modules furnishes unique opportunities for toxicologists to generate computationally predictive adverse outcome pathways, linking chemical-gene molecular initiating events with phenotypic key events, adverse diseases, and population-level health outcomes. As examples, we present 3 diverse case studies discerning the effect of vehicle emissions on altered leukocyte migration, the role of cadmium in influencing phenotypes preceding Alzheimer disease, and the connection of arsenic-induced glucose metabolic phenotypes with diabetes. To date, CTD contains over 165 000 interactions that connect more than 6400 chemicals to 3900 phenotypes for 760 anatomical terms in 215 species, from over 19 000 scientific articles. To our knowledge, this is the first comprehensive set of manually curated, literature-based, contextualized, chemical-induced, nondisease phenotype data provided to the public.}, number={1}, journal={TOXICOLOGICAL SCIENCES}, author={Davis, Allan Peter and Wiegers, Thomas C. and Wiegers, Jolene and Johnson, Robin J. and Sciaky, Daniela and Grondin, Cynthia J. 
and Mattingly, Carolyn J.}, year={2018}, month=sep, pages={145--156} } @article{davis_grondin_johnson_sciaky_mcmorran_wiegers_wiegers_mattingly_2019, title={The Comparative Toxicogenomics Database: update 2019}, volume={47}, ISSN={1362-4962}, DOI={10.1093/nar/gky868}, abstractNote={The Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) is a premier public resource for literature-based, manually curated associations between chemicals, gene products, phenotypes, diseases, and environmental exposures. In this biennial update, we present our new chemical–phenotype module that codes chemical-induced effects on phenotypes, curated using controlled vocabularies for chemicals, phenotypes, taxa, and anatomical descriptors; this module provides unique opportunities to explore cellular and system-level phenotypes of the pre-disease state and allows users to construct predictive adverse outcome pathways (linking chemical–gene molecular initiating events with phenotypic key events, diseases, and population-level health outcomes). We also report a 46% increase in CTD manually curated content, which when integrated with other datasets yields more than 38 million toxicogenomic relationships. We describe new querying and display features for our enhanced chemical–exposure science module, providing greater scope of content and utility. As well, we discuss an updated MEDIC disease vocabulary with over 1700 new terms and accession identifiers. To accommodate these increases in data content and functionality, CTD has upgraded its computational infrastructure. These updates continue to improve CTD and help inform new testable hypotheses about the etiology and mechanisms underlying environmentally influenced diseases.}, number={D1}, journal={NUCLEIC ACIDS RESEARCH}, author={Davis, Allan Peter and Grondin, Cynthia J. and Johnson, Robin J. and Sciaky, Daniela and McMorran, Roy and Wiegers, Jolene and Wiegers, Thomas C.
and Mattingly, Carolyn J.}, year={2019}, month=jan, pages={D948--D954} } @article{grondin_davis_wiegers_king_wiegers_reif_hoppin_mattingly_2016, title={Advancing Exposure Science through Chemical Data Curation and Integration in the Comparative Toxicogenomics Database}, volume={124}, ISSN={0091-6765 1552-9924}, url={http://dx.doi.org/10.1289/EHP174}, DOI={10.1289/ehp174}, abstractNote={Background: Exposure science studies the interactions and outcomes between environmental stressors and human or ecological receptors. To augment its role in understanding human health and the exposome, we aimed to centralize and integrate exposure science data into the broader biological framework of the Comparative Toxicogenomics Database (CTD), a public resource that promotes understanding of environmental chemicals and their effects on human health. Objectives: We integrated exposure data within the CTD to provide a centralized, freely available resource that facilitates identification of connections between real-world exposures, chemicals, genes/proteins, diseases, biological processes, and molecular pathways. Methods: We developed a manual curation paradigm that captures exposure data from the scientific literature using controlled vocabularies and free text within the context of four primary exposure concepts: stressor, receptor, exposure event, and exposure outcome. Using data from the Agricultural Health Study, we have illustrated the benefits of both centralization and integration of exposure information with CTD core data. Results: We have described our curation process, demonstrated how exposure data can be accessed and analyzed in the CTD, and shown how this integration provides a broad biological context for exposure data to promote mechanistic understanding of environmental influences on human health.
Conclusions: Curation and integration of exposure data within the CTD provides researchers with new opportunities to correlate exposures with human health outcomes, to identify underlying potential molecular mechanisms, and to improve understanding about the exposome. Citation: Grondin CJ, Davis AP, Wiegers TC, King BL, Wiegers JA, Reif DM, Hoppin JA, Mattingly CJ. 2016. Advancing exposure science through chemical data curation and integration in the Comparative Toxicogenomics Database. Environ Health Perspect 124:1592–1599; http://dx.doi.org/10.1289/EHP174}, number={10}, journal={Environmental Health Perspectives}, publisher={Environmental Health Perspectives}, author={Grondin, Cynthia J. and Davis, Allan Peter and Wiegers, Thomas C. and King, Benjamin L. and Wiegers, Jolene A. and Reif, David M. and Hoppin, Jane A. and Mattingly, Carolyn J.}, year={2016}, month=oct, pages={1592--1599} } @article{wei_peng_leaman_davis_mattingly_li_wiegers_lu_2016, title={Assessing the state of the art in biomedical relation extraction: overview of the BioCreative V chemical-disease relation (CDR) task}, ISSN={1758-0463}, DOI={10.1093/database/baw032}, abstractNote={Manually curating chemicals, diseases and their relationships is significantly important to biomedical research, but it is plagued by its high cost and the rapid growth of the biomedical literature. In recent years, there has been a growing interest in developing computational approaches for automatic chemical-disease relation (CDR) extraction. Despite these attempts, the lack of a comprehensive benchmarking dataset has limited the comparison of different techniques in order to assess and advance the current state-of-the-art. To this end, we organized a challenge task through BioCreative V to automatically extract CDRs from the literature. We designed two challenge tasks: disease named entity recognition (DNER) and chemical-induced disease (CID) relation extraction.
To assist system development and assessment, we created a large annotated text corpus that consisted of human annotations of chemicals, diseases and their interactions from 1500 PubMed articles. 34 teams worldwide participated in the CDR task: 16 (DNER) and 18 (CID). The best systems achieved an F-score of 86.46% for the DNER task—a result that approaches the human inter-annotator agreement (0.8875)—and an F-score of 57.03% for the CID task, the highest results ever reported for such tasks. When combining team results via machine learning, the ensemble system was able to further improve over the best team results by achieving 88.89% and 62.80% in F-score for the DNER and CID task, respectively. Additionally, another novel aspect of our evaluation is to test each participating system’s ability to return real-time results: the average response time for each team’s DNER and CID web service systems were 5.6 and 9.3 s, respectively. Most teams used hybrid systems for their submissions based on machining learning. Given the level of participation and results, we found our task to be successful in engaging the text-mining research community, producing a large annotated corpus and improving the results of automatic disease recognition and CDR extraction. Database URL: http://www.biocreative.org/tasks/biocreative-v/track-3-cdr/}, journal={DATABASE-THE JOURNAL OF BIOLOGICAL DATABASES AND CURATION}, author={Wei, Chih-Hsuan and Peng, Yifan and Leaman, Robert and Davis, Allan Peter and Mattingly, Carolyn J. and Li, Jiao and Wiegers, Thomas C. and Lu, Zhiyong}, year={2016}, month=mar } @article{li_sun_johnson_sciaky_wei_leaman_davis_mattingly_wiegers_lu_et_al._2016, title={BioCreative V CDR task corpus: a resource for chemical disease relation extraction}, ISSN={1758-0463}, DOI={10.1093/database/baw068}, abstractNote={Community-run, formal evaluations and manually annotated text corpora are critically important for advancing biomedical text-mining research.
Recently in BioCreative V, a new challenge was organized for the tasks of disease named entity recognition (DNER) and chemical-induced disease (CID) relation extraction. Given the nature of both tasks, a test collection is required to contain both disease/chemical annotations and relation annotations in the same set of articles. Despite previous efforts in biomedical corpus construction, none was found to be sufficient for the task. Thus, we developed our own corpus called BC5CDR during the challenge by inviting a team of Medical Subject Headings (MeSH) indexers for disease/chemical entity annotation and Comparative Toxicogenomics Database (CTD) curators for CID relation annotation. To ensure high annotation quality and productivity, detailed annotation guidelines and automatic annotation tools were provided. The resulting BC5CDR corpus consists of 1500 PubMed articles with 4409 annotated chemicals, 5818 diseases and 3116 chemical-disease interactions. Each entity annotation includes both the mention text spans and normalized concept identifiers, using MeSH as the controlled vocabulary. To ensure accuracy, the entities were first captured independently by two annotators followed by a consensus annotation: The average inter-annotator agreement (IAA) scores were 87.49% and 96.05% for the disease and chemicals, respectively, in the test set according to the Jaccard similarity coefficient. Our corpus was successfully used for the BioCreative V challenge tasks and should serve as a valuable resource for the text-mining research community. Database URL: http://www.biocreative.org/tasks/biocreative-v/track-3-cdr/}, journal={DATABASE-THE JOURNAL OF BIOLOGICAL DATABASES AND CURATION}, author={Li, J. and Sun, Y. P. and Johnson, R. J. and Sciaky, D. and Wei, C. H. and Leaman, R. and Davis, A. P. and Mattingly, Carolyn and Wiegers, T. C. and Lu, Z. Y. 
and et al.}, year={2016}, month={May} } @article{davis_wiegers_king_wiegers_grondin_sciaky_johnson_mattingly_2016, title={Generating Gene Ontology-Disease Inferences to Explore Mechanisms of Human Disease at the Comparative Toxicogenomics Database}, volume={11}, ISSN={["1932-6203"]}, DOI={10.1371/journal.pone.0155530}, abstractNote={Strategies for discovering common molecular events among disparate diseases hold promise for improving understanding of disease etiology and expanding treatment options. One technique is to leverage curated datasets found in the public domain. The Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) manually curates chemical-gene, chemical-disease, and gene-disease interactions from the scientific literature. The use of official gene symbols in CTD interactions enables this information to be combined with the Gene Ontology (GO) file from NCBI Gene. By integrating these GO-gene annotations with CTD’s gene-disease dataset, we produce 753,000 inferences between 15,700 GO terms and 4,200 diseases, providing opportunities to explore presumptive molecular underpinnings of diseases and identify biological similarities. Through a variety of applications, we demonstrate the utility of this novel resource. As a proof-of-concept, we first analyze known repositioned drugs (e.g., raloxifene and sildenafil) and see that their target diseases have a greater degree of similarity when comparing GO terms vs. genes. Next, a computational analysis predicts seemingly non-intuitive diseases (e.g., stomach ulcers and atherosclerosis) as being similar to bipolar disorder, and these are validated in the literature as reported co-diseases. Additionally, we leverage other CTD content to develop testable hypotheses about thalidomide-gene networks to treat seemingly disparate diseases. 
Finally, we illustrate how CTD tools can rank a series of drugs as potential candidates for repositioning against B-cell chronic lymphocytic leukemia and predict cisplatin and the small molecule inhibitor JQ1 as lead compounds. The CTD dataset is freely available for users to navigate pathologies within the context of extensive biological processes, molecular functions, and cellular components conferred by GO. This inference set should aid researchers, bioinformaticists, and pharmaceutical drug makers in finding commonalities in disease mechanisms, which in turn could help identify new therapeutics, new indications for existing pharmaceuticals, potential disease comorbidities, and alerts for side effects.}, number={5}, journal={PLOS ONE}, author={Davis, Allan Peter and Wiegers, Thomas C. and King, Benjamin L. and Wiegers, Jolene and Grondin, Cynthia J. and Sciaky, Daniela and Johnson, Robin J. and Mattingly, Carolyn J.}, year={2016}, month=may }
@article{davis_grondin_johnson_sciaky_king_mcmorran_wiegers_wiegers_mattingly_2017, title={The {Comparative Toxicogenomics Database}: update 2017}, volume={45}, ISSN={1362-4962}, DOI={10.1093/nar/gkw838}, abstractNote={The Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) provides information about interactions between chemicals and gene products, and their relationships to diseases. Core CTD content (chemical-gene, chemical-disease and gene-disease interactions manually curated from the literature) are integrated with each other as well as with select external datasets to generate expanded networks and predict novel associations. Today, core CTD includes more than 30.5 million toxicogenomic connections relating chemicals/drugs, genes/proteins, diseases, taxa, Gene Ontology (GO) annotations, pathways, and gene interaction modules.
In this update, we report a 33% increase in our core data content since 2015, describe our new exposure module (that harmonizes exposure science information with core toxicogenomic data) and introduce a novel dataset of GO-disease inferences (that identify common molecular underpinnings for seemingly unrelated pathologies). These advancements centralize and contextualize real-world chemical exposures with molecular pathways to help scientists generate testable hypotheses in an effort to understand the etiology and mechanisms underlying environmentally influenced diseases.}, number={D1}, journal={Nucleic Acids Research}, author={Davis, Allan Peter and Grondin, Cynthia J. and Johnson, Robin J. and Sciaky, Daniela and King, Benjamin L. and McMorran, Roy and Wiegers, Jolene and Wiegers, Thomas C. and Mattingly, Carolyn J.}, year={2017}, month=jan, pages={D972--D978} }
@article{pelletier_wiegers_enayetallah_kibbey_gosink_koza-taylor_mattingly_lawton_2016, title={{ToxEvaluator}: an integrated computational platform to aid the interpretation of toxicology study-related findings}, ISSN={1758-0463}, DOI={10.1093/database/baw062}, abstractNote={Attempts are frequently made to investigate adverse findings from preclinical toxicology studies in order to better understand underlying toxicity mechanisms. These efforts often begin with limited information, including a description of the adverse finding, knowledge of the structure of the chemical associated with its cause and the intended pharmacological target. ToxEvaluator was developed jointly by Pfizer and the Comparative Toxicogenomics Database (http://ctdbase.org) team at North Carolina State University as an in silico platform to facilitate interpretation of toxicity findings in light of prior knowledge.
Through the integration of a diverse set of in silico tools that leverage a number of public and proprietary databases, ToxEvaluator streamlines the process of aggregating and interrogating diverse sources of information. The user enters compound and target identifiers, and selects adverse event descriptors from a safety lexicon and mapped MeSH disease terms. ToxEvaluator provides a summary report with multiple distinct areas organized according to what target or structural aspects have been linked to the adverse finding, including primary pharmacology, structurally similar proprietary compounds, structurally similar public domain compounds, predicted secondary (i.e. off-target) pharmacology and known secondary pharmacology. Similar proprietary compounds and their associated in vivo toxicity findings are reported, along with a link to relevant supporting documents. For similar public domain compounds and interacting targets, ToxEvaluator integrates relationships curated in Comparative Toxicogenomics Database, returning all direct and inferred linkages between them. As an example of its utility, we demonstrate how ToxEvaluator rapidly identified direct (primary pharmacology) and indirect (secondary pharmacology) linkages between cerivastatin and myopathy.}, journal={Database: The Journal of Biological Databases and Curation}, author={Pelletier, D. and Wiegers, T. C. and Enayetallah, A. and Kibbey, C. and Gosink, M. and Koza-Taylor, P. and Mattingly, C. J. and Lawton, M.}, year={2016}, month=may }
% NOTE(review): the key of the next entry contains a space ("et al."), which many BibTeX parsers reject; it is left unchanged so existing references to it keep working. Consider renaming.
@article{comeau_batista-navarro_dai_dogan_yepes_khare_lu_marques_mattingly_neves_et al._2014, title={{BioC} interoperability track overview}, ISSN={1758-0463}, DOI={10.1093/database/bau053}, abstractNote={BioC is a new simple XML format for sharing biomedical text and annotations and libraries to read and write that format. This promotes the development of interoperable tools for natural language processing (NLP) of biomedical text.
The interoperability track at the BioCreative IV workshop featured contributions using or highlighting the BioC format. These contributions included additional implementations of BioC, many new corpora in the format, biomedical NLP tools consuming and producing the format and online services using the format. The ease of use, broad support and rapidly growing number of tools demonstrate the need for and value of the BioC format. Database URL: http://bioc.sourceforge.net/}, journal={Database: The Journal of Biological Databases and Curation}, author={Comeau, Donald C. and Batista-Navarro, Riza Theresa and Dai, Hong-Jie and Dogan, Rezarta Islamaj and Yepes, Antonio Jimeno and Khare, Ritu and Lu, Zhiyong and Marques, Hernani and Mattingly, Carolyn J. and Neves, Mariana and others}, year={2014}, month=jun }
@article{arighi_wu_cohen_hirschman_krallinger_valencia_lu_wilbur_wiegers_2014, title={{BioCreative-IV} virtual issue}, ISSN={1758-0463}, DOI={10.1093/database/bau039}, abstractNote={BioCreative: Critical Assessment of Information Extraction in Biology is an international community-wide effort for evaluating text mining (TM) and information extraction systems applied to the biological domain (http://www.biocreative.org/). The Challenge Evaluations and the accompanying BioCreative Workshops bring together the TM and biology communities to drive the development of practically relevant TM systems. One of the main goals of this initiative is that the resulting systems facilitate a more efficient literature information access to biologists in general, but also provide tools that can be directly integrated into the biocuration workflow and the knowledge discovery process carried out by databases. Beyond addressing the current barriers faced by TM technologies applied to biological literature, BioCreative has further been conducting user requirement analyses, user-based evaluations and fostering standards development for TM tool reuse and integration.
This DATABASE virtual issue captures the major results from the Fourth BioCreative Challenge Evaluation Workshop, and is the sixth special issue devoted to BioCreative. Built on the success of the previous Challenge Evaluations and Workshops (BioCreative I, II, II.5, III, 2012) (1–5), the BioCreative IV Workshop was held in Bethesda, MD, on October 7–9, 2013.}, journal={Database: The Journal of Biological Databases and Curation}, author={Arighi, Cecilia N. and Wu, Cathy H. and Cohen, Kevin B. and Hirschman, Lynette and Krallinger, Martin and Valencia, Alfonso and Lu, Zhiyong and Wilbur, John W. and Wiegers, Thomas C.}, year={2014}, month=may }
@article{davis_grondin_lennon-hopkins_saraceni-richards_sciaky_king_wiegers_mattingly_2015, title={The {Comparative Toxicogenomics Database}'s 10th year anniversary: update 2015}, volume={43}, ISSN={1362-4962}, DOI={10.1093/nar/gku935}, abstractNote={Ten years ago, the Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) was developed out of a need to formalize, harmonize and centralize the information on numerous genes and proteins responding to environmental toxic agents across diverse species. CTD's initial approach was to facilitate comparisons of nucleotide and protein sequences of toxicologically significant genes by curating these sequences and electronically annotating them with chemical terms from their associated references. Since then, however, CTD has vastly expanded its scope to robustly represent a triad of chemical–gene, chemical–disease and gene–disease interactions that are manually curated from the scientific literature by professional biocurators using controlled vocabularies, ontologies and structured notation. Today, CTD includes 24 million toxicogenomic connections relating chemicals/drugs, genes/proteins, diseases, taxa, phenotypes, Gene Ontology annotations, pathways and interaction modules.
In this 10th year anniversary update, we outline the evolution of CTD, including our increased data content, new ‘Pathway View’ visualization tool, enhanced curation practices, pilot chemical–phenotype results and impending exposure data set. The prototype database originally described in our first report has transformed into a sophisticated resource used actively today to help scientists develop and test hypotheses about the etiologies of environmentally influenced diseases.}, number={D1}, journal={Nucleic Acids Research}, author={Davis, Allan Peter and Grondin, Cynthia J. and Lennon-Hopkins, Kelley and Saraceni-Richards, Cynthia and Sciaky, Daniela and King, Benjamin L. and Wiegers, Thomas C. and Mattingly, Carolyn J.}, year={2015}, month=jan, pages={D914--D920} }
@article{wiegers_davis_mattingly_2014, title={Web services-based text-mining demonstrates broad impacts for interoperability and process simplification}, ISSN={1758-0463}, DOI={10.1093/database/bau050}, abstractNote={The Critical Assessment of Information Extraction systems in Biology (BioCreAtIvE) challenge evaluation tasks collectively represent a community-wide effort to evaluate a variety of text-mining and information extraction systems applied to the biological domain. The BioCreative IV Workshop included five independent subject areas, including Track 3, which focused on named-entity recognition (NER) for the Comparative Toxicogenomics Database (CTD; http://ctdbase.org). Previously, CTD had organized document ranking and NER-related tasks for the BioCreative Workshop 2012; a key finding of that effort was that interoperability and integration complexity were major impediments to the direct application of the systems to CTD's text-mining pipeline. This underscored a prevailing problem with software integration efforts.
Major interoperability-related issues included lack of process modularity, operating system incompatibility, tool configuration complexity and lack of standardization of high-level inter-process communications. One approach to potentially mitigate interoperability and general integration issues is the use of Web services to abstract implementation details; rather than integrating NER tools directly, HTTP-based calls from CTD's asynchronous, batch-oriented text-mining pipeline could be made to remote NER Web services for recognition of specific biological terms using BioC (an emerging family of XML formats) for inter-process communications. To test this concept, participating groups developed Representational State Transfer /BioC-compliant Web services tailored to CTD's NER requirements. Participants were provided with a comprehensive set of training materials. CTD evaluated results obtained from the remote Web service-based URLs against a test data set of 510 manually curated scientific articles. Twelve groups participated in the challenge. Recall, precision, balanced F-scores and response times were calculated. Top balanced F-scores for gene, chemical and disease NER were 61, 74 and 51%, respectively. Response times ranged from fractions-of-a-second to over a minute per article. We present a description of the challenge and summary of results, demonstrating how curation groups can effectively use interoperable NER technologies to simplify text-mining pipeline implementation. Database URL: http://ctdbase.org/}, journal={Database: The Journal of Biological Databases and Curation}, author={Wiegers, Thomas C.
and Davis, Allan Peter and Mattingly, Carolyn J.}, year={2014}, month=jun }
% NOTE(review): the key of the next entry contains a space ("et al."), which many BibTeX parsers reject; it is left unchanged so existing references to it keep working. Consider renaming.
@article{davis_wiegers_roberts_king_lay_lennon-hopkins_sciaky_johnson_keating_greene_et al._2013, title={A {CTD-Pfizer} collaboration: manual curation of 88 000 scientific articles text mined for drug-disease and drug-phenotype interactions}, ISSN={1758-0463}, DOI={10.1093/database/bat080}, abstractNote={Improving the prediction of chemical toxicity is a goal common to both environmental health research and pharmaceutical drug development. To improve safety detection assays, it is critical to have a reference set of molecules with well-defined toxicity annotations for training and validation purposes. Here, we describe a collaboration between safety researchers at Pfizer and the research team at the Comparative Toxicogenomics Database (CTD) to text mine and manually review a collection of 88 629 articles relating over 1 200 pharmaceutical drugs to their potential involvement in cardiovascular, neurological, renal and hepatic toxicity. In 1 year, CTD biocurators curated 254 173 toxicogenomic interactions (152 173 chemical–disease, 58 572 chemical–gene, 5 345 gene–disease and 38 083 phenotype interactions). All chemical–gene–disease interactions are fully integrated with public CTD, and phenotype interactions can be downloaded. We describe Pfizer’s text-mining process to collate the articles, and CTD’s curation strategy, performance metrics, enhanced data content and new module to curate phenotype information. As well, we show how data integration can connect phenotypes to diseases. This curation can be leveraged for information about toxic endpoints important to drug safety and help develop testable hypotheses for drug–disease events. The availability of these detailed, contextualized, high-quality annotations curated from seven decades’ worth of the scientific literature should help facilitate new mechanistic screening assays for pharmaceutical compound survival.
This unique partnership demonstrates the importance of resource sharing and collaboration between public and private entities and underscores the complementary needs of the environmental health science and pharmaceutical communities. Database URL: http://ctdbase.org/}, journal={Database: The Journal of Biological Databases and Curation}, author={Davis, Allan Peter and Wiegers, Thomas C. and Roberts, Phoebe M. and King, Benjamin L. and Lay, Jean M. and Lennon-Hopkins, Kelley and Sciaky, Daniela and Johnson, Robin and Keating, Heather and Greene, Nigel and others}, year={2013}, month=nov }
% NOTE(review): the key of the next entry contains a space ("et al."), which many BibTeX parsers reject; it is left unchanged so existing references to it keep working. Consider renaming.
@article{comeau_dogan_ciccarese_cohen_krallinger_leitner_lu_peng_rinaldi_torii_et al._2013, title={{BioC}: a minimalist approach to interoperability for biomedical text processing}, ISSN={1758-0463}, DOI={10.1093/database/bat064}, abstractNote={A vast amount of scientific information is encoded in natural language text, and the quantity of such text has become so great that it is no longer economically feasible to have a human as the first step in the search process. Natural language processing and text mining tools have become essential to facilitate the search for and extraction of information from text. This has led to vigorous research efforts to create useful tools and to create humanly labeled text corpora, which can be used to improve such tools. To encourage combining these efforts into larger, more powerful and more capable systems, a common interchange format to represent, store and exchange the data in a simple manner between different language processing systems and text mining tools is highly desirable. Here we propose a simple extensible mark-up language format to share text documents and annotations. The proposed annotation approach allows a large number of different annotations to be represented including sentences, tokens, parts of speech, named entities such as genes or diseases and relationships between named entities.
In addition, we provide simple code to hold this data, read it from and write it back to extensible mark-up language files and perform some sample processing. We also describe completed as well as ongoing work to apply the approach in several directions. Code and data are available at http://bioc.sourceforge.net/. Database URL: http://bioc.sourceforge.net/}, journal={Database: The Journal of Biological Databases and Curation}, author={Comeau, Donald C. and Dogan, Rezarta Islamaj and Ciccarese, Paolo and Cohen, Kevin Bretonnel and Krallinger, Martin and Leitner, Florian and Lu, Zhiyong and Peng, Yifan and Rinaldi, Fabio and Torii, Manabu and others}, year={2013}, month=sep }
@article{davis_wiegers_johnson_lay_lennon-hopkins_saraceni-richards_sciaky_murphy_mattingly_2013, title={Text Mining Effectively Scores and Ranks the Literature for Improving Chemical-Gene-Disease Curation at the {Comparative Toxicogenomics Database}}, volume={8}, ISSN={1932-6203}, DOI={10.1371/journal.pone.0058201}, abstractNote={The Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) is a public resource that curates interactions between environmental chemicals and gene products, and their relationships to diseases, as a means of understanding the effects of environmental chemicals on human health. CTD provides a triad of core information in the form of chemical-gene, chemical-disease, and gene-disease interactions that are manually curated from scientific articles. To increase the efficiency, productivity, and data coverage of manual curation, we have leveraged text mining to help rank and prioritize the triaged literature. Here, we describe our text-mining process that computes and assigns each article a document relevancy score (DRS), wherein a high DRS suggests that an article is more likely to be relevant for curation at CTD.
We evaluated our process by first text mining a corpus of 14,904 articles triaged for seven heavy metals (cadmium, cobalt, copper, lead, manganese, mercury, and nickel). Based upon initial analysis, a representative subset corpus of 3,583 articles was then selected from the 14,094 articles and sent to five CTD biocurators for review. The resulting curation of these 3,583 articles was analyzed for a variety of parameters, including article relevancy, novel data content, interaction yield rate, mean average precision, and biological and toxicological interpretability. We show that for all measured parameters, the DRS is an effective indicator for scoring and improving the ranking of literature for the curation of chemical-gene-disease information at CTD. Here, we demonstrate how fully incorporating text mining-based DRS scoring into our curation pipeline enhances manual curation by prioritizing more relevant articles, thereby increasing data content, productivity, and efficiency.}, number={4}, journal={PLOS ONE}, author={Davis, Allan Peter and Wiegers, Thomas C. and Johnson, Robin J. and Lay, Jean M. and Lennon-Hopkins, Kelley and Saraceni-Richards, Cynthia and Sciaky, Daniela and Murphy, Cynthia Grondin and Mattingly, Carolyn J.}, year={2013}, month=apr }
% NOTE(review): the key of the next entry contains a space ("et al."), which many BibTeX parsers reject; it is left unchanged so existing references to it keep working. Consider renaming.
@article{wu_arighi_cohen_hirschman_krallinger_lu_mattingly_valencia_wiegers_wilbur_et al._2012, title={{BioCreative-2012} Virtual Issue}, ISSN={1758-0463}, DOI={10.1093/database/bas049}, abstractNote={BioCreative: Critical Assessment of Information Extraction in Biology is an international community-wide effort for evaluating text mining and information extraction systems applied to the biological domain (http://www.biocreative.org/). The Challenge Evaluations and the accompanying BioCreative Workshops bring together the text mining and biology communities to drive the development of text mining systems that can be integrated into the biocuration workflow and the knowledge discovery process.
To address the current barriers in using text mining in biology, BioCreative has further been conducting user requirement analysis, user-based evaluations and fostering standard development for text mining tool re-use and integration. This DATABASE virtual issue captures the major results from the BioCreative-2012 Workshop on Interactive Text Mining in the Biocuration Workflow and is the fifth special issue devoted to BioCreative.}, journal={Database: The Journal of Biological Databases and Curation}, author={Wu, C. H. and Arighi, C. N. and Cohen, K. B. and Hirschman, L. and Krallinger, M. and Lu, Z. Y. and Mattingly, Carolyn and Valencia, A. and Wiegers, T. C. and Wilbur, W. J. and others}, year={2012}, month=dec }
@article{wiegers_davis_mattingly_2012, title={Collaborative biocuration--text-mining development task for document prioritization for curation}, volume={2012}, ISSN={1758-0463}, url={http://dx.doi.org/10.1093/database/bas037}, DOI={10.1093/database/bas037}, abstractNote={The Critical Assessment of Information Extraction systems in Biology (BioCreAtIvE) challenge evaluation is a community-wide effort for evaluating text mining and information extraction systems for the biological domain. The ‘BioCreative Workshop 2012’ subcommittee identified three areas, or tracks, that comprised independent, but complementary aspects of data curation in which they sought community input: literature triage (Track I); curation workflow (Track II) and text mining/natural language processing (NLP) systems (Track III). Track I participants were invited to develop tools or systems that would effectively triage and prioritize articles for curation and present results in a prototype web interface. Training and test datasets were derived from the Comparative Toxicogenomics Database (CTD; http://ctdbase.org) and consisted of manuscripts from which chemical–gene–disease data were manually curated. A total of seven groups participated in Track I.
For the triage component, the effectiveness of participant systems was measured by aggregate gene, disease and chemical ‘named-entity recognition’ (NER) across articles; the effectiveness of ‘information retrieval’ (IR) was also measured based on ‘mean average precision’ (MAP). Top recall scores for gene, disease and chemical NER were 49, 65 and 82%, respectively; the top MAP score was 80%. Each participating group also developed a prototype web interface; these interfaces were evaluated based on functionality and ease-of-use by CTD’s biocuration project manager. In this article, we present a detailed description of the challenge and a summary of the results.}, journal={Database: The Journal of Biological Databases and Curation}, publisher={Oxford University Press (OUP)}, author={Wiegers, T. C. and Davis, A. P. and Mattingly, C. J.}, year={2012}, month=nov, pages={bas037} }
@article{king_davis_rosenstein_wiegers_mattingly_2012, title={Ranking Transitive Chemical-Disease Inferences Using Local Network Topology in the {Comparative Toxicogenomics Database}}, volume={7}, ISSN={1932-6203}, DOI={10.1371/journal.pone.0046524}, abstractNote={Exposure to chemicals in the environment is believed to play a critical role in the etiology of many human diseases. To enhance understanding about environmental effects on human health, the Comparative Toxicogenomics Database (CTD; http://ctdbase.org) provides unique curated data that enable development of novel hypotheses about the relationships between chemicals and diseases. CTD biocurators read the literature and curate direct relationships between chemicals-genes, genes-diseases, and chemicals-diseases. These direct relationships are then computationally integrated to create additional inferred relationships; for example, a direct chemical-gene statement can be combined with a direct gene-disease statement to generate a chemical-disease inference (inferred via the shared gene).
In CTD, the number of inferences has increased exponentially as the number of direct chemical, gene and disease interactions has grown. To help users navigate and prioritize these inferences for hypothesis development, we implemented a statistic to score and rank them based on the topology of the local network consisting of the chemical, disease and each of the genes used to make an inference. In this network, chemicals, diseases and genes are nodes connected by edges representing the curated interactions. Like other biological networks, node connectivity is an important consideration when evaluating the CTD network, as the connectivity of nodes follows the power-law distribution. Topological methods reduce the influence of highly connected nodes that are present in biological networks. We evaluated published methods that used local network topology to determine the reliability of protein–protein interactions derived from high-throughput assays. We developed a new metric that combines and weights two of these methods and uniquely takes into account the number of common neighbors and the connectivity of each entity involved. We present several CTD inferences as case studies to demonstrate the value of this metric and the biological relevance of the inferences.}, number={11}, journal={PLOS ONE}, author={King, Benjamin L. and Davis, Allan Peter and Rosenstein, Michael C. and Wiegers, Thomas C. and Mattingly, Carolyn J.}, year={2012}, month=nov }
@article{davis_johnson_lennon-hopkins_sciaky_rosenstein_wiegers_mattingly_2012, title={Targeted journal curation as a method to improve data currency at the {Comparative Toxicogenomics Database}}, journal={Database: The Journal of Biological Databases and Curation}, author={Davis, A. P. and Johnson, R. J. and Lennon-Hopkins, K. and Sciaky, D. and Rosenstein, M. C. and Wiegers, T. C. and Mattingly, C. J.}, year={2012} }