% Entries from yang_xu_conant_kishino_thorne_ji_2023 through
% choi_hobolth_robinson_kishino_thorne_2007 were cleaned up from a machine
% export: one field per line, bare ISSN/ISBN values, "and others" for truncated
% author lists, "--" in page ranges, unquoted month macros, ASCII-only LaTeX
% markup, and journal names in title case. Fields whose only content was the
% placeholder word "Abstract" were dropped.

@article{yang_xu_conant_kishino_thorne_ji_2023,
  author       = {Yang, Yixuan and Xu, Tanchumin and Conant, Gavin and Kishino, Hirohisa and Thorne, Jeffrey L. and Ji, Xiang},
  title        = {Interlocus Gene Conversion, Natural Selection, and Paralog Homogenization},
  journal      = {Molecular Biology and Evolution},
  volume       = {40},
  number       = {9},
  year         = {2023},
  month        = sep,
  issn         = {1537-1719},
  doi          = {10.1093/molbev/msad198},
}

@article{ji_fisher_su_thorne_potter_lemey_baele_suchard_2023,
  author       = {Ji, Xiang and Fisher, Alexander A. and Su, Shuo and Thorne, Jeffrey L. and Potter, Barney and Lemey, Philippe and Baele, Guy and Suchard, Marc A.},
  title        = {Scalable {Bayesian} Divergence Time Estimation With Ratio Transformations},
  journal      = {Systematic Biology},
  year         = {2023},
  month        = jul,
  issn         = {1076-836X},
  doi          = {10.1093/sysbio/syad039},
}

@article{hao_fleming_petterson_lyons_edger_pires_thorne_conant_2022,
  author       = {Hao, Yue and Fleming, Jonathon and Petterson, Joanna and Lyons, Eric and Edger, Patrick P. and Pires, J. Chris and Thorne, Jeffrey L. and Conant, Gavin C.},
  title        = {Convergent evolution of polyploid genomes from across the eukaryotic tree of life},
  journal      = {{G3}: Genes, Genomes, Genetics},
  volume       = {5},
  year         = {2022},
  month        = may,
  issn         = {2160-1836},
  doi          = {10.1093/g3journal/jkac094},
}

@article{seo_redelings_thorne_2022,
  author       = {Seo, Tae-Kun and Redelings, Benjamin D. and Thorne, Jeffrey L.},
  title        = {Correlations between alignment gaps and nucleotide substitution or amino acid replacement},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  volume       = {119},
  number       = {34},
  year         = {2022},
  month        = aug,
  issn         = {1091-6490},
  doi          = {10.1073/pnas.2204435119},
  abstractNote = {To assess the conventional treatment in evolutionary inference of alignment gaps as missing data, we propose a simple nonparametric test of the null hypothesis that the locations of alignment gaps are independent of the nucleotide substitution or amino acid replacement process. When we apply the test to 1,390 protein alignments that are informed by protein tertiary structure and use a 5\% significance level, the null hypothesis of independence between amino acid replacement and gap location is rejected for $\sim$65\% of datasets. Via simulations that include substitution and insertion--deletion, we show that the test performs well with true alignments. When we simulate according to the null hypothesis and then apply the test to optimal alignments that are inferred by each of four widely used software packages, the null hypothesis is rejected too frequently. Via further simulations and analyses, we show that the overly frequent rejections of the null hypothesis are not solely due to weaknesses of widely used software for finding optimal alignments. Instead, our evidence suggests that optimal alignments are unrepresentative of true alignments and that biased evolutionary inferences may result from relying upon individual optimal alignments.},
}

% Key previously contained a space ("et al."), which BibTeX cannot parse; the
% space is now an underscore.
@article{gunady_ware_plumlee_devos_corcoran_prinz_misetic_ciccarelli_harrison_thorne_et_al._2022,
  author       = {Gunady, Ella F. and Ware, Kathryn E. and Plumlee, Sarah Hoskinson and Devos, Nicolas and Corcoran, David and Prinz, Joseph and Misetic, Hrvoje and Ciccarelli, Francesca D. and Harrison, Tara M. and Thorne, Jeffrey L. and others},
  title        = {Exome sequencing of hepatocellular carcinoma in lemurs identifies potential cancer drivers: A pilot study},
  journal      = {Evolution, Medicine, and Public Health},
  volume       = {10},
  number       = {1},
  pages        = {221--230},
  year         = {2022},
  month        = jan,
  issn         = {2050-6201},
  doi          = {10.1093/emph/eoac016},
}

@article{seo_gascuel_thorne_2021,
  author       = {Seo, Tae-Kun and Gascuel, Olivier and Thorne, Jeffrey L.},
  title        = {Measuring Phylogenetic Information of Incomplete Sequence Data},
  journal      = {Systematic Biology},
  year         = {2021},
  month        = sep,
  issn         = {1076-836X},
  doi          = {10.1093/sysbio/syab073},
}

@article{campbell_tiley_poelstra_hunnicutt_larsen_lee_thorne_reis_yoder_2021,
  author       = {Campbell, C. Ryan and Tiley, George P. and Poelstra, Jelmer W. and Hunnicutt, Kelsie E. and Larsen, Peter A. and Lee, Hui-Jie and Thorne, Jeffrey L. and Reis, Mario and Yoder, Anne D.},
  title        = {Pedigree-based and phylogenetic methods support surprising patterns of mutation rate and spectrum in the gray mouse lemur},
  journal      = {Heredity},
  year         = {2021},
  month        = jul,
  issn         = {1365-2540},
  doi          = {10.1038/s41437-021-00446-5},
  abstractNote = {Mutations are the raw material on which evolution acts, and knowledge of their frequency and genomic distribution is crucial for understanding how evolution operates at both long and short timescales. At present, the rate and spectrum of de novo mutations have been directly characterized in relatively few lineages. Our study provides the first direct mutation-rate estimate for a strepsirrhine (i.e., the lemurs and lorises), which comprises nearly half of the primate clade. Using high-coverage linked-read sequencing for a focal quartet of gray mouse lemurs (Microcebus murinus), we estimated the mutation rate to be among the highest calculated for a mammal at $1.52 \times 10^{-8}$ (95\% credible interval: $1.28 \times 10^{-8}$--$1.78 \times 10^{-8}$) mutations/site/generation. Further, we found an unexpectedly low count of paternal mutations, and only a modest overrepresentation of mutations at CpG sites. Despite the surprising nature of these results, we found both the rate and spectrum to be robust to the manipulation of a wide range of computational filtering criteria. We also sequenced a technical replicate to estimate a false-negative and false-positive rate for our data and show that any point estimate of a de novo mutation rate should be considered with a large degree of uncertainty. For validation, we conducted an independent analysis of context-dependent substitution types for gray mouse lemur and five additional primate species for which de novo mutation rates have also been estimated. These comparisons revealed general consistency of the mutation spectrum between the pedigree-based and the substitution-rate analyses for all species compared.},
}

% Key previously contained a space ("et al."); the space is now an underscore.
@article{somarelli_boddy_gardner_dewitt_tuohy_megquier_sheth_hsu_thorne_london_et_al._2020,
  author       = {Somarelli, Jason A. and Boddy, Amy M. and Gardner, Heather L. and DeWitt, Suzanne Bartholf and Tuohy, Joanne and Megquier, Kate and Sheth, Maya U. and Hsu, Shiaowen David and Thorne, Jeffrey L. and London, Cheryl A. and others},
  title        = {Improving Cancer Drug Discovery by Studying Cancer across the Tree of Life},
  journal      = {Molecular Biology and Evolution},
  volume       = {37},
  number       = {1},
  pages        = {11--17},
  year         = {2020},
  month        = jan,
  issn         = {1537-1719},
  doi          = {10.1093/molbev/msz254},
}

@article{larson_thorne_schmidler_2020,
  author       = {Larson, Gary and Thorne, Jeffrey L. and Schmidler, Scott},
  title        = {Incorporating Nearest-Neighbor Site Dependence into Protein Evolution Models},
  journal      = {Journal of Computational Biology},
  volume       = {27},
  number       = {3},
  pages        = {361--375},
  year         = {2020},
  month        = mar,
  issn         = {1557-8666},
  doi          = {10.1089/cmb.2019.0500},
  abstractNote = {Evolutionary models of proteins are widely used for statistical sequence alignment and inference of homology and phylogeny. However, the vast majority of these models rely on an unrealistic assumption of independent evolution between sites. Here we focus on the related problem of protein structure alignment, a classic tool of computational biology that is widely used to identify structural and functional similarity and to infer homology among proteins. A site-independent statistical model for protein structural evolution has previously been introduced and shown to significantly improve alignments and phylogenetic inferences compared with approaches that utilize only amino acid sequence information. Here we extend this model to account for correlated evolutionary drift among neighboring amino acid positions. The result is a spatiotemporal model of protein structure evolution, described by a multivariate diffusion process convolved with a spatial birth-death process. This extended site-dependent model (SDM) comes with little additional computational cost or analytical complexity compared with the site-independent model (SIM). We demonstrate that this SDM yields a significant reduction of bias in estimated evolutionary distances and helps further improve phylogenetic tree reconstruction. We also develop a simple model of site-dependent sequence evolution, which we use to demonstrate the bias resulting from the application of standard site-independent sequence evolution models.},
}

% Conference paper in a proceedings volume: retyped from article so that the
% proceedings title is carried in booktitle rather than journal.
@inproceedings{larson_thorne_schmidler_2018,
  author       = {Larson, Gary and Thorne, Jeffrey L. and Schmidler, Scott},
  title        = {Modeling Dependence in Evolutionary Inference for Proteins},
  booktitle    = {Research in Computational Molecular Biology, {RECOMB} 2018},
  series       = {Lecture Notes in Computer Science},
  volume       = {10812},
  pages        = {122--137},
  year         = {2018},
  isbn         = {978-3-319-89928-2},
  issn         = {1611-3349},
  doi          = {10.1007/978-3-319-89929-9_8},
  abstractNote = {Protein structure alignment is a classic problem of computational biology, and is widely used to identify structural and functional similarity and to infer homology among proteins. Previously a statistical model for protein structural evolution has been introduced and shown to significantly improve phylogenetic inferences compared to approaches that utilize only amino acid sequence information. Here we extend this model to account for correlated evolutionary drift among neighboring amino acid positions, resulting in a spatio-temporal model of protein structure evolution. The result is a multivariate diffusion process convolved with a spatial birth-death process, which comes with little additional computational cost or analytical complexity compared to the site-independent model (SIM). We demonstrate that this extended, site-dependent model (SDM) yields a significant reduction of bias in estimated evolutionary distances and helps further improve phylogenetic tree reconstruction.},
}

@article{ji_griffing_thorne_2016,
  author       = {Ji, Xiang and Griffing, Alexander and Thorne, Jeffrey L.},
  title        = {A Phylogenetic Approach Finds Abundant Interlocus Gene Conversion in Yeast},
  journal      = {Molecular Biology and Evolution},
  volume       = {33},
  number       = {9},
  pages        = {2469--2476},
  year         = {2016},
  month        = sep,
  issn         = {1537-1719},
  doi          = {10.1093/molbev/msw114},
  abstractNote = {Interlocus gene conversion (IGC) homogenizes repeats. While genomes can be repeat-rich, the evolutionary importance of IGC is poorly understood. Additional statistical tools for characterizing it are needed. We propose a composite likelihood strategy for incorporating IGC into widely-used probabilistic models for sequence changes that originate with point mutation. We estimated the percentage of nucleotide substitutions that originate with an IGC event rather than a point mutation in 14 groups of yeast ribosomal protein-coding genes, and found values ranging from 20\% to 38\%. We designed and applied a procedure to determine whether these percentages are inflated due to artifacts arising from model misspecification. The results of this procedure are consistent with IGC having had an important role in the evolution of each of these 14 gene families. We further investigate the properties of our IGC approach via simulation. In contrast to usual practice, our findings suggest that the IGC should and can be considered when multigene family evolution is investigated.},
}

% Author initials expanded to the full names these authors carry in sibling
% entries; journal name unified with choi_redelings_thorne_2008.
@article{lee_kishino_rodrigue_thorne_2016,
  author       = {Lee, Hui-Jie and Kishino, Hirohisa and Rodrigue, Nicolas and Thorne, Jeffrey L.},
  title        = {Grouping substitution types into different relaxed molecular clocks},
  journal      = {Philosophical Transactions of the Royal Society B: Biological Sciences},
  volume       = {371},
  number       = {1699},
  year         = {2016},
}

% Key previously contained a space ("et al."); the space is now an underscore.
@article{lassiter_russ_nusbaum_zeng_saville_olarte_carbone_hu_seguin-orlando_samaniego_et_al._2015,
  author       = {Lassiter, Erica S. and Russ, Carsten and Nusbaum, Chad and Zeng, Qiandong and Saville, Amanda C. and Olarte, Rodrigo A. and Carbone, Ignazio and Hu, Chia-Hui and Seguin-Orlando, Andaine and Samaniego, Jose A. and others},
  title        = {Mitochondrial genome sequences reveal evolutionary relationships of the {Phytophthora} 1c clade species},
  journal      = {Current Genetics},
  volume       = {61},
  number       = {4},
  pages        = {567--577},
  year         = {2015},
  month        = mar,
  publisher    = {Springer Science and Business Media LLC},
  issn         = {0172-8083 1432-0983},
  doi          = {10.1007/s00294-015-0480-3},
  abstractNote = {Phytophthora infestans is one of the most destructive plant pathogens of potato and tomato globally. The pathogen is closely related to four other Phytophthora species in the 1c clade including P. phaseoli, P. ipomoeae, P. mirabilis and P. andina that are important pathogens of other wild and domesticated hosts. P. andina is an interspecific hybrid between P. infestans and an unknown Phytophthora species. We have sequenced mitochondrial genomes of the sister species of P. infestans and examined the evolutionary relationships within the clade. Phylogenetic analysis indicates that the P. phaseoli mitochondrial lineage is basal within the clade. P. mirabilis and P. ipomoeae are sister lineages and share a common ancestor with the Ic mitochondrial lineage of P. andina. These lineages in turn are sister to the P. infestans and P. andina Ia mitochondrial lineages. The P. andina Ic lineage diverged much earlier than the P. andina Ia mitochondrial lineage and P. infestans. The presence of two mitochondrial lineages in P. andina supports the hybrid nature of this species. The ancestral state of the P. andina Ic lineage in the tree and its occurrence only in the Andean regions of Ecuador, Colombia and Peru suggests that the origin of this species hybrid in nature may occur there.},
}

@article{lee_rodrigue_thorne_2015,
  author       = {Lee, Hui-Jie and Rodrigue, Nicolas and Thorne, Jeffrey L.},
  title        = {Relaxing the Molecular Clock to Different Degrees for Different Substitution Types},
  journal      = {Molecular Biology and Evolution},
  volume       = {32},
  number       = {8},
  pages        = {1948--1961},
  year         = {2015},
  month        = aug,
  issn         = {1537-1719},
  doi          = {10.1093/molbev/msv099},
  abstractNote = {Rates of molecular evolution can vary over time. Diverse statistical techniques for divergence time estimation have been developed to accommodate this variation. These typically require that all sequence (or codon) positions at a locus change independently of one another. They also generally assume that the rates of different types of nucleotide substitutions vary across a phylogeny in the same way. This permits divergence time estimation procedures to employ an instantaneous rate matrix with relative rates that do not differ among branches. However, previous studies have suggested that some substitution types (e.g., CpG to TpG changes in mammals) are more clock-like than others. As has been previously noted, this is biologically plausible given the mutational mechanism of CpG to TpG changes. Through stochastic mapping of sequence histories from context-independent substitution models, our approach allows for context-dependent nucleotide substitutions to change their relative rates over time. We apply our approach to the analysis of a 0.15 Mb intergenic region from eight primates. In accord with previous findings, we find comparatively little rate variation over time for CpG to TpG substitutions but we find more for other substitution types. We conclude by discussing the limitations and prospects of our approach.},
}

% The abstract's first mention of the estimated quantity read "27 N s"; it is
% corrected to "2 N s" to agree with the three later mentions in the same text.
@article{wang_yu_ji_lakner_griffing_thorne_2015,
  author       = {Wang, Kuangyu and Yu, Shuhui and Ji, Xiang and Lakner, Clemens and Griffing, Alexander and Thorne, Jeffrey L.},
  title        = {Roles of Solvent Accessibility and Gene Expression in Modeling Protein Sequence Evolution},
  journal      = {Evolutionary Bioinformatics},
  volume       = {11},
  year         = {2015},
  issn         = {1176-9343},
  doi          = {10.4137/ebo.s22911},
  abstractNote = {Models of protein evolution tend to ignore functional constraints, although structural constraints are sometimes incorporated. Here we propose a probabilistic framework for codon substitution that evaluates joint effects of relative solvent accessibility (RSA), a structural constraint; and gene expression, a functional constraint. First, we explore the relationship between RSA and codon usage at the genomic scale as well as at the individual gene scale. Motivated by these results, we construct our framework by determining how probable is an amino acid, given RSA and gene expression, and then evaluating the relative probability of observing a codon compared to other synonymous codons. We come to the biologically plausible conclusion that both RSA and gene expression are related to amino acid frequencies, but, among synonymous codons, the relative probability of a particular codon is more closely related to gene expression than RSA. To illustrate the potential applications of our framework, we propose a new codon substitution model. Using this model, we obtain estimates of 2 N s, the product of effective population size N, and relative fitness difference of allele s. For a training data set consisting of human proteins with known structures and expression data, 2 N s is estimated separately for synonymous and nonsynonymous substitutions in each protein. We then contrast the patterns of synonymous and nonsynonymous 2 N s estimates across proteins while also taking gene expression levels of the proteins into account. We conclude that our 2 N s estimates are too concentrated around 0, and we discuss potential explanations for this lack of variability.},
}

% Book chapter: retyped from article, with the book title moved from journal to
% booktitle. NOTE(review): publisher was added from outside this file -- please
% confirm; the book's editors are still missing.
@incollection{hobolth_thorne_2014,
  author       = {Hobolth, Asger and Thorne, Jeffrey L.},
  title        = {Sampling and summary statistics of endpoint-conditioned paths in {DNA} sequence evolution},
  booktitle    = {{Bayesian} Phylogenetics: Methods, Algorithms, and Applications},
  pages        = {247--275},
  year         = {2014},
  publisher    = {Chapman and Hall/CRC},
}

% Key previously contained spaces ("de koning", "et al."); spaces are now
% underscores.
@article{liberles_teichmann_bahar_bastolla_bloom_bornberg-bauer_colwell_de_koning_dokholyan_echave_et_al._2012,
  author       = {Liberles, David A. and Teichmann, Sarah A. and Bahar, Ivet and Bastolla, Ugo and Bloom, Jesse and Bornberg-Bauer, Erich and Colwell, Lucy J. and de Koning, A. P. Jason and Dokholyan, Nikolay V. and Echave, Julian and others},
  title        = {The interface of protein structure, protein biophysics, and molecular evolution},
  journal      = {Protein Science},
  volume       = {21},
  number       = {6},
  pages        = {769--785},
  year         = {2012},
  month        = apr,
  publisher    = {Wiley},
  issn         = {0961-8368},
  doi          = {10.1002/pro.2071},
}

% NOTE(review): same title, journal, volume, issue and pages as the entry
% directly above -- looks like a duplicate record; consider merging. Retyped
% from misc to article so the journal, volume and page fields are rendered.
% Key previously contained a space ("et al."); the space is now an underscore.
@article{liberles_teichmann_bahar_bastolla_bloom_bornberg-bauer_colwell_koning_dokholyan_echave_et_al._2012,
  author       = {Liberles, D. A. and Teichmann, S. A. and Bahar, I. and Bastolla, U. and Bloom, J. and Bornberg-Bauer, E. and Colwell, L. J. and de Koning, A. P. J. and Dokholyan, N. V. and Echave, J. and others},
  title        = {The interface of protein structure, protein biophysics, and molecular evolution},
  journal      = {Protein Science},
  volume       = {21},
  number       = {6},
  pages        = {769--785},
  year         = {2012},
}

@article{yokoyama_thorne_wray_2011,
  author       = {Yokoyama, Ken Daigoro and Thorne, Jeffrey L. and Wray, Gregory A.},
  title        = {Coordinated Genome-Wide Modifications within Proximal Promoter Cis-regulatory Elements during Vertebrate Evolution},
  journal      = {Genome Biology and Evolution},
  volume       = {3},
  pages        = {66--74},
  year         = {2011},
  issn         = {1759-6653},
  doi          = {10.1093/gbe/evq078},
  abstractNote = {There often exists a ``one-to-many'' relationship between a transcription factor and a multitude of binding sites throughout the genome. It is commonly assumed that transcription factor binding motifs remain largely static over the course of evolution because changes in binding specificity can alter the interactions with potentially hundreds of sites across the genome. Focusing on regulatory motifs overrepresented at specific locations within or near the promoter, we find that a surprisingly large number of cis-regulatory elements have been subject to coordinated genome-wide modifications during vertebrate evolution, such that the motif frequency changes on a single branch of vertebrate phylogeny. This was found to be the case even between closely related mammal species, with nearly a third of all location-specific consensus motifs exhibiting significant modifications within the human or mouse lineage since their divergence. Many of these modifications are likely to be compensatory changes throughout the genome following changes in protein factor binding affinities, whereas others may be due to changes in mutation rates or effective population size. The likelihood that this happened many times during vertebrate evolution highlights the need to examine additional taxa and to understand the evolutionary and molecular mechanisms underlying the evolution of protein--DNA interactions.},
}

@article{cartwright_lartillot_thorne_2011,
  author       = {Cartwright, Reed A. and Lartillot, Nicolas and Thorne, Jeffrey L.},
  title        = {History Can Matter: Non-{Markovian} Behavior of Ancestral Lineages},
  journal      = {Systematic Biology},
  volume       = {60},
  number       = {3},
  pages        = {276--290},
  year         = {2011},
  month        = may,
  issn         = {1076-836X},
  doi          = {10.1093/sysbio/syr012},
  abstractNote = {Although most of the important evolutionary events in the history of biology can only be studied via interspecific comparisons, it is challenging to apply the rich body of population genetic theory to the study of interspecific genetic variation. Probabilistic modeling of the substitution process would ideally be derived from first principles of population genetics, allowing a quantitative connection to be made between the parameters describing mutation, selection, drift, and the patterns of interspecific variation. There has been progress in reconciling population genetics and interspecific evolution for the case where mutation rates are sufficiently low, but when mutation rates are higher, reconciliation has been hampered due to complications from how the loss or fixation of new mutations can be influenced by linked nonneutral polymorphisms (i.e., the Hill-Robertson effect). To investigate the generation of interspecific genetic variation when concurrent fitness-affecting polymorphisms are common and the Hill-Robertson effect is thereby potentially strong, we used the Wright-Fisher model of population genetics to simulate very many generations of mutation, natural selection, and genetic drift. This was done so that the chronological history of advantageous, deleterious, and neutral substitutions could be traced over time along the ancestral lineage. Our simulations show that the process by which a nonrecombining sequence changes over time can markedly deviate from the Markov assumption that is ubiquitous in molecular phylogenetics. In particular, we find tendencies for advantageous substitutions to be followed by deleterious ones and for deleterious substitutions to be followed by advantageous ones. Such non-Markovian patterns reflect the fact that the fate of the ancestral lineage depends not only on its current allelic state but also on gene copies not belonging to the ancestral lineage. Although our simulations describe nonrecombining sequences, we conclude by discussing how non-Markovian behavior of the ancestral lineage is plausible even when recombination rates are not low. As a result, we believe that increased attention needs to be devoted to the robustness of evolutionary inference procedures that rely upon the Markov assumption.},
}

@article{choi_stone_kishino_thorne_2009,
  author       = {Choi, S. C. and Stone, E. A. and Kishino, H. and Thorne, J. L.},
  title        = {Estimates of natural selection due to protein tertiary structure inform the ancestry of biallelic loci},
  journal      = {Gene},
  volume       = {441},
  number       = {1--2},
  pages        = {45--52},
  year         = {2009},
  doi          = {10.1016/j.gene.2008.07.020},
  abstractNote = {We consider the inference of which of two alleles is ancestral when the alleles have a single nonsynonymous difference and when natural selection acts via protein tertiary structure. Whereas the probability that an allele is ancestral under neutrality is equal to its frequency, under selection this probability depends on allele frequency and on the magnitude and direction of selection pressure. Although allele frequencies can be well estimated from intraspecific data, small fitness differences have a large evolutionary impact but can be difficult to estimate with only intraspecific data. Methods for predicting aspects of phenotype from genotype can supplement intraspecific sequence data. Recently developed statistical techniques can assess effects of phenotypes, such as protein tertiary structure on molecular evolution. While these techniques were initially designed for comparing protein-coding genes from different species, the resulting interspecific inferences can be assigned population genetic interpretations to assess the effect of selection pressure, and we use them here along with intraspecific allele frequency data to estimate the probability that an allele is ancestral. We focus on 140 nonsynonymous single nucleotide polymorphisms of humans that are in proteins with known tertiary structures. We find that our technique for employing protein tertiary structure information yields some biologically plausible results but that it does not substantially improve the inference of ancestral human allele types.},
}

@article{choi_redelings_thorne_2008,
  author       = {Choi, Sang Chul and Redelings, Benjamin D. and Thorne, Jeffrey L.},
  title        = {Basing population genetic inferences and models of molecular evolution upon desired stationary distributions of {DNA} or protein sequences},
  journal      = {Philosophical Transactions of the Royal Society B: Biological Sciences},
  volume       = {363},
  number       = {1512},
  pages        = {3931--3939},
  year         = {2008},
  month        = dec,
  issn         = {0962-8436},
  doi          = {10.1098/rstb.2008.0167},
  abstractNote = {Models of molecular evolution tend to be overly simplistic caricatures of biology that are prone to assigning high probabilities to biologically implausible DNA or protein sequences. Here, we explore how to construct time-reversible evolutionary models that yield stationary distributions of sequences that match given target distributions. By adopting comparatively realistic target distributions, evolutionary models can be improved. Instead of focusing on estimating parameters, we concentrate on the population genetic implications of these models. Specifically, we obtain estimates of the product of effective population size and relative fitness difference of alleles. The approach is illustrated with two applications to protein-coding DNA. In the first, a codon-based evolutionary model yields a stationary distribution of sequences, which, when the sequences are translated, matches a variable-length Markov model trained on human proteins. In the second, we introduce an insertion--deletion model that describes selectively neutral evolutionary changes to DNA. We then show how to modify the neutral model so that its stationary distribution at the amino acid level can match a profile hidden Markov model, such as the one associated with the Pfam database.},
}

@article{xiang_thorne_seo_zhang_thomas_ricklefs_2008,
  author       = {Xiang, Qiu-Yun (Jenny) and Thorne, Jeffrey L. and Seo, Tae-Kun and Zhang, Wenheng and Thomas, David T. and Ricklefs, Robert E.},
  title        = {Rates of nucleotide substitution in {Cornaceae} ({Cornales})---{Pattern} of variation and underlying causal factors},
  journal      = {Molecular Phylogenetics and Evolution},
  volume       = {49},
  number       = {1},
  pages        = {327--342},
  year         = {2008},
  month        = oct,
  issn         = {1055-7903},
  doi          = {10.1016/j.ympev.2008.07.010},
  url          = {https://linkinghub.elsevier.com/retrieve/pii/S1055790308003606},
  abstractNote = {Identifying causes of genetic divergence is a central goal in evolutionary biology. Although rates of nucleotide substitution vary among taxa and among genes, the causes of this variation tend to be poorly understood. In the present study, we examined the rate and pattern of molecular evolution for five DNA regions over a phylogeny of Cornus, the single genus of Cornaceae. To identify evolutionary mechanisms underlying the molecular variation, we employed Bayesian methods to estimate divergence times and to infer how absolute rates of synonymous and nonsynonymous substitutions and their ratios change over time. We found that the rates vary among genes, lineages, and through time, and differences in mutation rates, selection type and intensity, and possibly genetic drift all contributed to the variation of substitution rates observed among the major lineages of Cornus. We applied independent contrast analysis to explore whether speciation rates are linked to rates of molecular evolution. The results showed no relationships for individual genes, but suggested a possible localized link between species richness and rate of nonsynonymous nucleotide substitution for the combined cpDNA regions. Furthermore, we detected a positive correlation between rates of molecular evolution and morphological change in Cornus. This was particularly pronounced in the dwarf dogwood lineage, in which genome-wide acceleration in both molecular and morphological evolution has likely occurred.},
}

@article{lin_fang_thorne_2007,
  author       = {Lin, Yu-Min and Fang, Shu-Cherng and Thorne, Jeffrey L.},
  title        = {A tabu search algorithm for maximum parsimony phylogeny inference},
  journal      = {European Journal of Operational Research},
  volume       = {176},
  number       = {3},
  pages        = {1908--1917},
  year         = {2007},
  month        = feb,
  issn         = {1872-6860},
  doi          = {10.1016/j.ejor.2005.10.031},
  abstractNote = {Phylogeny reconstruction is too complex a combinatorial problem for an exhaustive search, because the number of possible solutions increases exponentially with the number of taxa involved. In this paper, we adopt the parsimony principle and design a tabu search algorithm for finding a most parsimonious phylogeny tree. A special array structure is employed to represent the topology of trees and to generate the neighboring trees. We test the proposed tabu search algorithm on randomly selected data sets obtained from nuclear ribosomal DNA sequence data. The experiments show that our algorithm explores fewer trees to reach the optimal one than the commonly used program ``dnapenny'' (branch-and-bound based) while it generates much more accurate results than the default options of the program ``dnapars'' (heuristic search based). The percentage of search space needed to find the best solution for our algorithm decreased rapidly as the number of taxa increased. For a 20-taxon phylogeny problem, it needs on average to examine only $3.92 \times 10^{-15}$\% of the sample space.},
}

@article{thorne_choi_yu_higgs_kishino_2007,
  author       = {Thorne, Jeffrey L. and Choi, Sang Chul and Yu, Jiaye and Higgs, Paul G. and Kishino, Hirohisa},
  title        = {Population genetics without intraspecific data},
  journal      = {Molecular Biology and Evolution},
  volume       = {24},
  number       = {8},
  pages        = {1667--1677},
  year         = {2007},
  month        = aug,
  issn         = {1537-1719},
  doi          = {10.1093/molbev/msm085},
  abstractNote = {A central goal of computational biology is the prediction of phenotype from DNA and protein sequence data. Recent models of sequence change use in silico prediction systems to incorporate the effects of phenotype on evolutionary rates. These models have been designed for analyzing sequence data from different species and have been accompanied by statistical techniques for estimating model parameters when the incorporation of phenotype induces dependent change among sequence positions. A difficulty with these efforts to link phenotype and interspecific evolution is that evolution occurs within populations, and parameters of interspecific models should have population genetic interpretations. We show, with two examples, how population genetic interpretations can be assigned to evolutionary models. The first example considers the impact of RNA secondary structure on sequence change, and the second reflects the tendency for protein tertiary structure to influence nonsynonymous substitution rates. We argue that statistical fit to data should not be the sole criterion for assessing models of sequence change. A good interspecific model should also yield a clear and biologically plausible population genetic interpretation.},
}

@article{thorne_2007,
  author       = {Thorne, Jeffrey L.},
  title        = {Protein evolution constraints and model-based techniques to study them},
  journal      = {Current Opinion in Structural Biology},
  volume       = {17},
  number       = {3},
  pages        = {337--341},
  year         = {2007},
  month        = jun,
  issn         = {1879-033X},
  doi          = {10.1016/j.sbi.2007.05.006},
  abstractNote = {There have been substantial improvements in statistical tools for assessing the evolutionary roles of mutation and natural selection from interspecific sequence data. The importance of having the rate at which a point mutation occurs depend on the DNA sequence at sites surrounding the mutation is now better appreciated and can be accommodated in probabilistic models of protein evolution. To quantify the evolutionary impact of some aspect of phenotype, one promising strategy is to develop a system for predicting phenotype from the DNA sequence and to then infer how the evolutionary rates of sequence change are affected by the predicted phenotypic consequences of the changes. Although statistical tools for characterizing protein evolution are improving, the list of candidate phenomena that can affect rates of protein evolution is long and the relative contributions of these phenomena are only beginning to be disentangled.},
}

@article{choi_hobolth_robinson_kishino_thorne_2007,
  author       = {Choi, Sang Chul and Hobolth, Asger and Robinson, Douglas M. and Kishino, Hirohisa and Thorne, Jeffrey L.},
  title        = {Quantifying the impact of protein tertiary structure on molecular evolution},
  journal      = {Molecular Biology and Evolution},
  volume       = {24},
  number       = {8},
  pages        = {1769--1782},
  year         = {2007},
  month        = aug,
  issn         = {0737-4038},
  doi          = {10.1093/molbev/msm097},
  abstractNote = {To investigate the evolutionary impact of protein structure, the experimentally determined tertiary structure and the protein-coding DNA sequence were collected for each of 1,195 genes. These genes were studied via a model of sequence change that explicitly incorporates effects on evolutionary rates due to protein tertiary structure. In the model, these effects act via the solvent accessibility environments and pairwise amino acid interactions that are induced by tertiary structure. To compare the hypotheses that structure does and does not have a strong influence on evolution, Bayes factors were estimated for each of the 1,195 sequences. Most of the Bayes factors strongly support the hypothesis that protein structure affects protein evolution. Furthermore, both solvent accessibility and pairwise interactions among amino acids are inferred to have important roles in protein evolution. Our results also indicate that the strength of the relationship between tertiary structure and evolution has a weak but real correlation to the annotation information in the Gene Ontology database. Although their influences on rates of evolution vary among protein families, we find that the mean impacts of solvent accessibility and pairwise interactions are about the same.},
}

@article{yu_thorne_2006, title={Dependence among sites in RNA evolution}, volume={23}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msl015}, abstractNote={Although probabilistic models of genotype (e.g., DNA sequence) evolution have been greatly elaborated, less attention has been paid to the effect of phenotype on the evolution of the genotype. Here we propose an evolutionary model and a Bayesian inference procedure that are aimed at filling this gap. In the model, RNA secondary structure links genotype and phenotype by treating the approximate free energy of a sequence folded into a secondary structure as a surrogate for fitness. The underlying idea is that a nucleotide substitution resulting in a more stable secondary structure should have a higher rate than a substitution that yields a less stable secondary structure. 
This free energy approach incorporates evolutionary dependencies among sequence positions beyond those that are reflected simply by jointly modeling change at paired positions in an RNA helix. Although there is not a formal requirement with this approach that secondary structure be known and nearly invariant over evolutionary time, computational considerations make these assumptions attractive and they have been adopted in a software program that permits statistical analysis of multiple homologous sequences that are related via a known phylogenetic tree topology. Analyses of 5S ribosomal RNA sequences are presented to illustrate and quantify the strong impact that RNA secondary structure has on substitution rates. Analyses on simulated sequences show that the new inference procedure has reasonable statistical properties. Potential applications of this procedure, including improved ancestral sequence inference and location of functionally interesting sites, are discussed.}, number={8}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Yu, Jiaye and Thorne, Jeffrey L.}, year={2006}, month={Aug}, pages={1525–1537} } @article{yu_thorne_2006, title={Testing for spatial clustering of amino acid replacements within protein tertiary structure}, volume={62}, ISSN={["1432-1432"]}, DOI={10.1007/s00239-005-0107-2}, abstractNote={Widely used models of protein evolution ignore protein structure. Therefore, these models do not predict spatial clustering of amino acid replacements with respect to tertiary structure. One formal and biologically implausible possibility is that there is no tendency for amino acid replacements to be spatially clustered during evolution. An alternative to this is that amino acid replacements are spatially clustered and this spatial clustering can be fully explained by a tendency for similar rates of amino acid replacement at sites that are nearby in protein tertiary structure. 
A third possibility is that the amount of clustering exceeds that which can be explained solely on the basis of independently evolving protein sites with spatially clustered replacement rates. We introduce two simple and not very parametric hypothesis tests that help distinguish these three possibilities. We then apply these tests to 273 homologous protein families. The null hypothesis of no spatial clustering is rejected for 102 of 273 families. The explanation of spatially clustered rates but independent change among sites is rejected for 43 families. These findings need to be reconciled with the common practice of basing evolutionary inferences on models that assume independent change among sites.}, number={6}, journal={JOURNAL OF MOLECULAR EVOLUTION}, author={Yu, Jiaye and Thorne, Jeffrey L.}, year={2006}, month={Jun}, pages={682–692} } @article{seo_kishino_thorne_2005, title={Incorporating gene-specific variation when inferring and evaluating optimal evolutionary tree topologies from multilocus sequence data}, volume={102}, ISSN={["0027-8424"]}, DOI={10.1073/pnas.0408313102}, abstractNote={Because of the increase of genomic data, multiple genes are often available for the inference of phylogenetic relationships. The simple approach for combining multiple genes from the same taxon is to concatenate the sequences and then ignore the fact that different positions in the concatenated sequence came from different genes. Here, we discuss two criteria for inferring the optimal tree topology from data sets with multiple genes. These criteria are designed for multigene data sets where gene-specific evolutionary features are too important to ignore. One criterion is conventional and is obtained by taking the sum of log-likelihoods over all genes. The other criterion is obtained by dividing the log-likelihood for a gene by its sequence length and then taking the arithmetic mean over genes of these ratios. A similar strategy could be adopted with parsimony scores. 
The optimal tree is then declared to be the one for which the sum or the arithmetic mean is maximized. These criteria are justified within a two-stage hierarchical framework. The first level of the hierarchy represents gene-specific evolutionary features, and the second represents site-specific features for given genes. For testing significance of the optimal topology, we suggest a two-stage bootstrap procedure that involves resampling genes and then resampling alignment columns within resampled genes. An advantage of this procedure over concatenation is that it can effectively account for gene-specific evolutionary features. We discuss the applicability of the two-stage bootstrap idea to the Kishino–Hasegawa test and the Shimodaira–Hasegawa test.}, number={12}, journal={PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES OF THE UNITED STATES OF AMERICA}, author={Seo, TK and Kishino, H and Thorne, JL}, year={2005}, month={Mar}, pages={4436–4441} } @article{thorne_2004, title={A Bayesian approach to DNA sequence segmentation - Discussion}, volume={60}, ISSN={["0006-341X"]}, DOI={10.1111/j.0006-341X.2004.206_5.x}, abstractNote={.}, number={3}, journal={BIOMETRICS}, author={Thorne, JL}, year={2004}, month={Sep}, pages={584–585} } @article{seo_kishino_thorne_2004, title={Estimating absolute rates of synonymous and nonsynonymous nucleotide substitution in order to characterize natural selection and date species divergences}, volume={21}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msh088}, abstractNote={The rate of molecular evolution can vary among lineages. Sources of this variation have differential effects on synonymous and nonsynonymous substitution rates. Changes in effective population size or patterns of natural selection will mainly alter nonsynonymous substitution rates. Changes in generation length or mutation rates are likely to have an impact on both synonymous and nonsynonymous substitution rates. 
By comparing changes in synonymous and nonsynonymous rates, the relative contributions of the driving forces of evolution can be better characterized. Here, we introduce a procedure for estimating the chronological rates of synonymous and nonsynonymous substitutions on the branches of an evolutionary tree. Because the widely used ratio of nonsynonymous and synonymous rates is not designed to detect simultaneous increases or simultaneous decreases in synonymous and nonsynonymous rates, the estimation of these rates rather than their ratio can improve characterization of the evolutionary process. With our Bayesian approach, we analyze cytochrome oxidase subunit I evolution in primates and infer that nonsynonymous rates have a greater tendency to change over time than do synonymous rates. Our analysis of these data also suggests that rates have been positively correlated.}, number={7}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Seo, TK and Kishino, H and Thorne, JL}, year={2004}, month={Jul}, pages={1201–1213} } @misc{sanderson_thorne_wikstrom_bremer_2004, title={Molecular evidence on plant divergence times}, volume={91}, ISSN={["1537-2197"]}, DOI={10.3732/ajb.91.10.1656}, abstractNote={Estimation of divergence times from sequence data has become increasingly feasible in recent years. Conflicts between fossil evidence and molecular dates have sparked the development of new methods for inferring divergence times, further encouraging these efforts. In this paper, available methods for estimating divergence times are reviewed, especially those geared toward handling the widespread variation in rates of molecular evolution observed among lineages. The assumptions, strengths, and weaknesses of local clock, Bayesian, and rate smoothing methods are described. The rapidly growing literature applying these methods to key divergence times in plant evolutionary history is also reviewed. 
These include the crown group ages of green plants, land plants, seed plants, angiosperms, and major subclades of angiosperms. Finally, attempts to infer divergence times are described in the context of two very different temporal settings: recent adaptive radiations and much more ancient biogeographic patterns.}, number={10}, journal={AMERICAN JOURNAL OF BOTANY}, author={Sanderson, MJ and Thorne, JL and Wikstrom, N and Bremer, K}, year={2004}, month={Oct}, pages={1656–1665} } @article{scholl_thorne_mccarter_bird_2003, title={Horizontally transferred genes in plant-parasitic nematodes: a high-throughput genomic approach}, volume={4}, number={6}, journal={Genome Biology}, author={Scholl, E. H. and Thorne, J. L. and McCarter, J. P. and Bird, D. M.}, year={2003}, pages={R39–1} } @article{robinson_jones_kishino_goldman_thorne_2003, title={Protein evolution with dependence among codons due to tertiary structure}, volume={20}, ISSN={["0737-4038"]}, DOI={10.1093/molbev/msg184}, abstractNote={Markovian models of protein evolution that relax the assumption of independent change among codons are considered. With this comparatively realistic framework, an evolutionary rate at a site can depend both on the state of the site and on the states of surrounding sites. By allowing a relatively general dependence structure among sites, models of evolution can reflect attributes of tertiary structure. To quantify the impact of protein structure on protein evolution, we analyze protein-coding DNA sequence pairs with an evolutionary model that incorporates effects of solvent accessibility and pairwise interactions among amino acid residues. By explicitly considering the relationship between nonsynonymous substitution rates and protein structure, this approach can lead to refined detection and characterization of positive selection. Analyses of simulated sequence pairs indicate that parameters in this evolutionary model can be well estimated. 
Analyses of lysozyme c and annexin V sequence pairs yield the biologically reasonable result that amino acid replacement rates are higher when the replacements lead to energetically favorable proteins than when they destabilize the proteins. Although the focus here is evolutionary dependence among codons that is associated with protein structure, the statistical approach is quite general and could be applied to diverse cases of evolutionary dependence where surrogates for sequence fitness can be measured or modeled.}, number={10}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Robinson, DM and Jones, DT and Kishino, H and Goldman, N and Thorne, JL}, year={2003}, month={Oct}, pages={1692–1704} } @article{wiegmann_yeates_thorne_kishino_2003, title={Time flies, a new molecular time-scale for brachyceran fly evolution without a clock}, volume={52}, number={6}, journal={Systematic Biology}, author={Wiegmann, B. M. and Yeates, D. K. and Thorne, J. L. and Kishino, H.}, year={2003}, pages={745–756} } @misc{hasegawa_thorne_kishino_2003, title={Time scale of eutherian evolution estimated without assuming a constant rate of molecular evolution}, volume={78}, ISSN={["1880-5779"]}, DOI={10.1266/ggs.78.267}, abstractNote={Controversies over the molecular clock hypothesis were reviewed. Since it is evident that the molecular clock does not hold in an exact sense, accounting for evolution of the rate of molecular evolution is a prerequisite when estimating divergence times with molecular sequences. Recently proposed statistical methods that account for this rate variation are overviewed and one of these procedures is applied to the mitochondrial protein sequences and to the nuclear gene sequences from many mammalian species in order to estimate the time scale of eutherian evolution. This Bayesian method not only takes account of the variation of molecular evolutionary rate among lineages and among genes, but it also incorporates fossil evidence via constraints on node times. 
With denser taxonomic sampling and a more realistic model of molecular evolution, this Bayesian approach is expected to increase the accuracy of divergence time estimates.}, number={4}, journal={GENES & GENETIC SYSTEMS}, author={Hasegawa, M and Thorne, JL and Kishino, H}, year={2003}, month={Aug}, pages={267–283} } @article{seo_thorne_hasegawa_kishino_2002, title={A viral sampling design for testing the molecular clock and for estimating evolutionary rates and divergence times}, volume={18}, ISSN={["1367-4803"]}, DOI={10.1093/bioinformatics/18.1.115}, abstractNote={Abstract}, number={1}, journal={BIOINFORMATICS}, author={Seo, TK and Thorne, JL and Hasegawa, M and Kishino, H}, year={2002}, month={Jan}, pages={115–123} } @article{thorne_kishino_2002, title={Divergence time and evolutionary rate estimation with multilocus data}, volume={51}, ISSN={["1076-836X"]}, DOI={10.1080/10635150290102456}, abstractNote={Bayesian methods for estimating evolutionary divergence times are extended to multigene data sets, and a technique is described for detecting correlated changes in evolutionary rates among genes. Simulations are employed to explore the effect of multigene data on divergence time estimation, and the methodology is illustrated with a previously published data set representing diverse plant taxa. The fact that evolutionary rates and times are confounded when sequence data are compared is emphasized and the importance of fossil information for disentangling rates and times is stressed.}, number={5}, journal={SYSTEMATIC BIOLOGY}, author={Thorne, JL and Kishino, H}, year={2002}, pages={689–702} } @article{seo_thorne_hasegawa_kishino_2002, title={Estimation of effective population size of HIV-1 within a host: A pseudomaximum-likelihood approach}, volume={160}, number={4}, journal={Genetics}, author={Seo, T. K. and Thorne, J. L. and Hasegawa, M. 
and Kishino, H.}, year={2002}, pages={1283–1293} } @article{kishino_thorne_bruno_2001, title={Performance of a divergence time estimation method under a probabilistic model of rate evolution}, volume={18}, ISSN={["0737-4038"]}, DOI={10.1093/oxfordjournals.molbev.a003811}, abstractNote={Rates of molecular evolution vary over time and, hence, among lineages. In contrast, widely used methods for estimating divergence times from molecular sequence data assume constancy of rates. Therefore, methods for estimation of divergence times that incorporate rate variation are attractive. Improvements on a previously proposed Bayesian technique for divergence time estimation are described. New parameterization more effectively captures the phylogenetic structure of rate evolution on a tree. Fossil information and other evidence can now be included in Bayesian analyses in the form of constraints on divergence times. Simulation results demonstrate that the accuracy of divergence time estimation is substantially enhanced when constraints are included.}, number={3}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Kishino, H and Thorne, JL and Bruno, WJ}, year={2001}, month={Mar}, pages={352–361} } @article{thorne_2000, title={Models of protein sequence evolution and their applications}, volume={10}, ISSN={["1879-0380"]}, DOI={10.1016/S0959-437X(00)00142-8}, abstractNote={Homologous sequences are correlated due to their common ancestry. Probabilistic models of sequence evolution are employed routinely to properly account for these phylogenetic correlations. These increasingly realistic models provide a basis for studying evolution and for exploiting it to better understand protein structure and function. 
Notable recent advances have been made in the treatment of insertion and deletion events, the estimation of amino-acid replacement rates, and the detection of positive selection.}, number={6}, journal={CURRENT OPINION IN GENETICS & DEVELOPMENT}, author={Thorne, JL}, year={2000}, month={Dec}, pages={602–605} } @article{goldman_thorne_jones_1998, title={Assessing the impact of secondary structure and solvent accessibility on protein evolution}, volume={149}, number={1}, journal={Genetics}, author={Goldman, N. and Thorne, J. L. and Jones, D. T.}, year={1998}, pages={445–458} } @article{thorne_kishino_painter_1998, title={Estimating the rate of evolution of the rate of molecular evolution}, volume={15}, ISSN={["0737-4038"]}, DOI={10.1093/oxfordjournals.molbev.a025892}, abstractNote={A simple model for the evolution of the rate of molecular evolution is presented. With a Bayesian approach, this model can serve as the basis for estimating dates of important evolutionary events even in the absence of the assumption of constant rates among evolutionary lineages. The method can be used in conjunction with any of the widely used models for nucleotide substitution or amino acid replacement. It is illustrated by analyzing a data set of rbcL protein sequences.}, number={12}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Thorne, JL and Kishino, H and Painter, IS}, year={1998}, month={Dec}, pages={1647–1657} } @article{lio_goldman_thorne_jones_1998, title={PASSML: combining evolutionary inference and protein secondary structure prediction}, volume={14}, ISSN={["1367-4803"]}, DOI={10.1093/bioinformatics/14.8.726}, abstractNote={Abstract}, number={8}, journal={BIOINFORMATICS}, author={Lio, P and Goldman, N and Thorne, JL and Jones, DT}, year={1998}, pages={726–733} }