@article{yang_xu_conant_kishino_thorne_ji_2023, title={Interlocus Gene Conversion, Natural Selection, and Paralog Homogenization}, volume={40}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msad198}, abstractNote={Abstract Following a duplication, the resulting paralogs tend to diverge. While mutation and natural selection can accelerate this process, they can also slow it. Here, we quantify the paralog homogenization that is caused by point mutations and interlocus gene conversion (IGC). Among 164 duplicated teleost genes, the median percentage of postduplication codon substitutions that arise from IGC rather than point mutation is estimated to be between 7% and 8%. By differentiating between the nonsynonymous codon substitutions that homogenize the protein sequences of paralogs and the nonhomogenizing nonsynonymous substitutions, we estimate the homogenizing nonsynonymous rates to be higher for 163 of the 164 teleost data sets as well as for all 14 data sets of duplicated yeast ribosomal protein-coding genes that we consider. For all 14 yeast data sets, the estimated homogenizing nonsynonymous rates exceed the synonymous rates.}, number={9}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Yang, Yixuan and Xu, Tanchumin and Conant, Gavin and Kishino, Hirohisa and Thorne, Jeffrey L. and Ji, Xiang}, year={2023}, month={Sep} } @article{ji_fisher_su_thorne_potter_lemey_baele_suchard_2023, title={Scalable Bayesian Divergence Time Estimation With Ratio Transformations}, ISSN={["1076-836X"]}, DOI={10.1093/sysbio/syad039}, abstractNote={Abstract Divergence time estimation is crucial to provide temporal signals for dating biologically important events from species divergence to viral transmissions in space and time. With the advent of high-throughput sequencing, recent Bayesian phylogenetic studies have analyzed hundreds to thousands of sequences. 
Such large-scale analyses challenge divergence time reconstruction by requiring inference on highly correlated internal node heights that often become computationally infeasible. To overcome this limitation, we explore a ratio transformation that maps the original $N-1$ internal node heights into a space of one height parameter and $N-2$ ratio parameters. To make the analyses scalable, we develop a collection of linear-time algorithms to compute the gradient and Jacobian-associated terms of the log-likelihood with respect to these ratios. We then apply Hamiltonian Monte Carlo sampling with the ratio transform in a Bayesian framework to learn the divergence times in 4 pathogenic viruses (West Nile virus, rabies virus, Lassa virus, and Ebola virus) and the coralline red algae. Our method both resolves a mixing issue in the West Nile virus example and improves inference efficiency by at least 5-fold for the Lassa and rabies virus examples as well as for the algae example. Our method now also makes it computationally feasible to incorporate mixed-effects molecular clock models for the Ebola virus example, confirms the findings from the original study, and reveals clearer multimodal distributions of the divergence times of some clades of interest.}, journal={SYSTEMATIC BIOLOGY}, author={Ji, Xiang and Fisher, Alexander A. and Su, Shuo and Thorne, Jeffrey L. 
and Potter, Barney and Lemey, Philippe and Baele, Guy and Suchard, Marc A.}, year={2023}, month={Jul} } @article{hao_fleming_petterson_lyons_edger_pires_thorne_conant_2022, title={Convergent evolution of polyploid genomes from across the eukaryotic tree of life}, volume={5}, ISSN={["2160-1836"]}, DOI={10.1093/g3journal/jkac094}, abstractNote={Abstract By modeling the homoeologous gene losses that occurred in 50 genomes deriving from ten distinct polyploidy events, we show that the evolutionary forces acting on polyploids are remarkably similar, regardless of whether they occur in flowering plants, ciliates, fishes, or yeasts. We show that many of the events show a relative rate of duplicate gene loss before the first postpolyploidy speciation that is significantly higher than in later phases of their evolution. The relatively weak selective constraint experienced by the single-copy genes these losses produced leads us to suggest that most of the purely selectively neutral duplicate gene losses occur in the immediate postpolyploid period. Nearly all of the events show strong evidence of biases in the duplicate losses, consistent with them being allopolyploidies, with 2 distinct progenitors contributing to the modern species. We also find ongoing and extensive reciprocal gene losses (alternative losses of duplicated ancestral genes) between these genomes. With the exception of a handful of closely related taxa, all of these polyploid organisms are separated from each other by tens to thousands of reciprocal gene losses. As a result, it is very unlikely that viable diploid hybrid species could form between these taxa, since matings between such hybrids would tend to produce offspring lacking essential genes. 
It is, therefore, possible that the relatively high frequency of recurrent polyploidies in some lineages may be due to the ability of new polyploidies to bypass reciprocal gene loss barriers.}, journal={G3-GENES GENOMES GENETICS}, author={Hao, Yue and Fleming, Jonathon and Petterson, Joanna and Lyons, Eric and Edger, Patrick P. and Pires, J. Chris and Thorne, Jeffrey L. and Conant, Gavin C.}, year={2022}, month={May} } @article{seo_redelings_thorne_2022, title={Correlations between alignment gaps and nucleotide substitution or amino acid replacement}, volume={119}, ISSN={["1091-6490"]}, DOI={10.1073/pnas.2204435119}, abstractNote={To assess the conventional treatment in evolutionary inference of alignment gaps as missing data, we propose a simple nonparametric test of the null hypothesis that the locations of alignment gaps are independent of the nucleotide substitution or amino acid replacement process. When we apply the test to 1,390 protein alignments that are informed by protein tertiary structure and use a 5% significance level, the null hypothesis of independence between amino acid replacement and gap location is rejected for ∼65% of datasets. Via simulations that include substitution and insertion–deletion, we show that the test performs well with true alignments. When we simulate according to the null hypothesis and then apply the test to optimal alignments that are inferred by each of four widely used software packages, the null hypothesis is rejected too frequently. Via further simulations and analyses, we show that the overly frequent rejections of the null hypothesis are not solely due to weaknesses of widely used software for finding optimal alignments. 
Instead, our evidence suggests that optimal alignments are unrepresentative of true alignments and that biased evolutionary inferences may result from relying upon individual optimal alignments.}, number={34}, journal={PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES OF THE UNITED STATES OF AMERICA}, author={Seo, Tae-Kun and Redelings, Benjamin D. and Thorne, Jeffrey L.}, year={2022}, month={Aug} }

@comment{Key of the next entry normalized: the exported key contained a space ("et al."), which is not valid inside a citation key. The truncated author list is expressed with "and others".}
@article{gunady_ware_plumlee_devos_corcoran_prinz_misetic_ciccarelli_harrison_thorne_et_al._2022,
  title={Exome sequencing of hepatocellular carcinoma in lemurs identifies potential cancer drivers: A pilot study},
  volume={10},
  ISSN={2050-6201},
  DOI={10.1093/emph/eoac016},
  abstractNote={Abstract Background and objectives Hepatocellular carcinoma occurs frequently in prosimians, but the cause of these liver cancers in this group is unknown. Characterizing the genetic changes associated with hepatocellular carcinoma in prosimians may point to possible causes, treatments and methods of prevention, aiding conservation efforts that are particularly crucial to the survival of endangered lemurs. Although genomic studies of cancer in non-human primates have been hampered by a lack of tools, recent studies have demonstrated the efficacy of using human exome capture reagents across primates. Methodology In this proof-of-principle study, we applied human exome capture reagents to tumor–normal pairs from five lemurs with hepatocellular carcinoma to characterize the mutational landscape of this disease in lemurs. Results Several genes implicated in human hepatocellular carcinoma, including ARID1A, TP53 and CTNNB1, were mutated in multiple lemurs, and analysis of cancer driver genes mutated in these samples identified enrichment of genes involved with TP53 degradation and regulation. In addition to these similarities with human hepatocellular carcinoma, we also noted unique features, including six genes that contain mutations in all five lemurs. Interestingly, these genes are infrequently mutated in human hepatocellular carcinoma, suggesting potential differences in the etiology and/or progression of this cancer in lemurs and humans. Conclusions and implications Collectively, this pilot study suggests that human exome capture reagents are a promising tool for genomic studies of cancer in lemurs and other non-human primates. Lay Summary Hepatocellular carcinoma occurs frequently in prosimians, but the cause of these liver cancers is unknown. In this proof-of-principle study, we applied human DNA sequencing tools to tumor–normal pairs from five lemurs with hepatocellular carcinoma and compared the lemur mutation profiles to those of human hepatocellular carcinomas.},
  number={1},
  journal={EVOLUTION MEDICINE AND PUBLIC HEALTH},
  author={Gunady, Ella F. and Ware, Kathryn E. and Plumlee, Sarah Hoskinson and Devos, Nicolas and Corcoran, David and Prinz, Joseph and Misetic, Hrvoje and Ciccarelli, Francesca D. and Harrison, Tara M. and Thorne, Jeffrey L. and others},
  year={2022},
  month=jan,
  pages={221--230}
}

@article{seo_gascuel_thorne_2021, title={Measuring Phylogenetic Information of Incomplete Sequence Data}, ISSN={["1076-836X"]}, DOI={10.1093/sysbio/syab073}, abstractNote={Abstract Widely used approaches for extracting phylogenetic information from aligned sets of molecular sequences rely upon probabilistic models of nucleotide substitution or amino-acid replacement. The phylogenetic information that can be extracted depends on the number of columns in the sequence alignment and will be decreased when the alignment contains gaps due to insertion or deletion events. Motivated by the measurement of information loss, we suggest assessment of the effective sequence length (ESL) of an aligned data set. The ESL can differ from the actual number of columns in a sequence alignment because of the presence of alignment gaps. Furthermore, the estimation of phylogenetic information is affected by model misspecification.
Inevitably, the actual process of molecular evolution differs from the probabilistic models employed to describe this process. This disparity means the amount of phylogenetic information in an actual sequence alignment will differ from the amount in a simulated data set of equal size, which motivated us to develop a new test for model adequacy. Via theory and empirical data analysis, we show how to disentangle the effects of gaps and model misspecification. By comparing the Fisher information of actual and simulated sequences, we identify which alignment sites and tree branches are most affected by gaps and model misspecification. [Fisher information; gaps; insertion; deletion; indel; model adequacy; goodness-of-fit test; sequence alignment.]}, journal={SYSTEMATIC BIOLOGY}, author={Seo, Tae-Kun and Gascuel, Olivier and Thorne, Jeffrey L.}, year={2021}, month={Sep} } @article{campbell_tiley_poelstra_hunnicutt_larsen_lee_thorne_reis_yoder_2021, title={Pedigree-based and phylogenetic methods support surprising patterns of mutation rate and spectrum in the gray mouse lemur}, ISSN={["1365-2540"]}, DOI={10.1038/s41437-021-00446-5}, abstractNote={Mutations are the raw material on which evolution acts, and knowledge of their frequency and genomic distribution is crucial for understanding how evolution operates at both long and short timescales. At present, the rate and spectrum of de novo mutations have been directly characterized in relatively few lineages. Our study provides the first direct mutation-rate estimate for a strepsirrhine (i.e., the lemurs and lorises), which comprises nearly half of the primate clade. Using high-coverage linked-read sequencing for a focal quartet of gray mouse lemurs (Microcebus murinus), we estimated the mutation rate to be among the highest calculated for a mammal at $1.52 \times 10^{-8}$ (95% credible interval: $1.28 \times 10^{-8}$–$1.78 \times 10^{-8}$) mutations/site/generation.
Further, we found an unexpectedly low count of paternal mutations, and only a modest overrepresentation of mutations at CpG sites. Despite the surprising nature of these results, we found both the rate and spectrum to be robust to the manipulation of a wide range of computational filtering criteria. We also sequenced a technical replicate to estimate a false-negative and false-positive rate for our data and show that any point estimate of a de novo mutation rate should be considered with a large degree of uncertainty. For validation, we conducted an independent analysis of context-dependent substitution types for gray mouse lemur and five additional primate species for which de novo mutation rates have also been estimated. These comparisons revealed general consistency of the mutation spectrum between the pedigree-based and the substitution-rate analyses for all species compared.}, journal={HEREDITY}, author={Campbell, C. Ryan and Tiley, George P. and Poelstra, Jelmer W. and Hunnicutt, Kelsie E. and Larsen, Peter A. and Lee, Hui-Jie and Thorne, Jeffrey L. and Reis, Mario and Yoder, Anne D.}, year={2021}, month={Jul} } @article{larson_thorne_schmidler_2020, title={Incorporating Nearest-Neighbor Site Dependence into Protein Evolution Models}, volume={27}, ISSN={["1557-8666"]}, DOI={10.1089/cmb.2019.0500}, abstractNote={Evolutionary models of proteins are widely used for statistical sequence alignment and inference of homology and phylogeny. However, the vast majority of these models rely on an unrealistic assumption of independent evolution between sites. Here we focus on the related problem of protein structure alignment, a classic tool of computational biology that is widely used to identify structural and functional similarity and to infer homology among proteins. 
A site-independent statistical model for protein structural evolution has previously been introduced and shown to significantly improve alignments and phylogenetic inferences compared with approaches that utilize only amino acid sequence information. Here we extend this model to account for correlated evolutionary drift among neighboring amino acid positions. The result is a spatiotemporal model of protein structure evolution, described by a multivariate diffusion process convolved with a spatial birth-death process. This extended site-dependent model (SDM) comes with little additional computational cost or analytical complexity compared with the site-independent model (SIM). We demonstrate that this SDM yields a significant reduction of bias in estimated evolutionary distances and helps further improve phylogenetic tree reconstruction. We also develop a simple model of site-dependent sequence evolution, which we use to demonstrate the bias resulting from the application of standard site-independent sequence evolution models.}, number={3}, journal={JOURNAL OF COMPUTATIONAL BIOLOGY}, author={Larson, Gary and Thorne, Jeffrey L. and Schmidler, Scott}, year={2020}, month={Mar}, pages={361--375} }

@comment{Key of the next entry normalized: the exported key contained a space ("et al."), which is not valid inside a citation key. The truncated author list is expressed with "and others".}
@article{somarelli_boddy_gardner_dewitt_tuohy_megquier_sheth_hsu_thorne_london_et_al._2020,
  title={Improving Cancer Drug Discovery by Studying Cancer across the Tree of Life},
  volume={37},
  ISSN={1537-1719},
  DOI={10.1093/molbev/msz254},
  abstractNote={Despite a considerable expenditure of time and resources and significant advances in experimental models of disease, cancer research continues to suffer from extremely low success rates in translating preclinical discoveries into clinical practice. The continued failure of cancer drug development, particularly late in the course of human testing, not only impacts patient outcomes, but also drives up the cost for those therapies that do succeed. It is clear that a paradigm shift is necessary if improvements in this process are to occur. One promising direction for increasing translational success is comparative oncology—the study of cancer across species, often involving veterinary patients that develop naturally-occurring cancers. Comparative oncology leverages the power of cross-species analyses to understand the fundamental drivers of cancer protective mechanisms, as well as factors contributing to cancer initiation and progression. Clinical trials in veterinary patients with cancer provide an opportunity to evaluate novel therapeutics in a setting that recapitulates many of the key features of human cancers, including genomic aberrations that underly tumor development, response and resistance to treatment, and the presence of comorbidities that can affect outcomes. With a concerted effort from basic scientists, human physicians and veterinarians, comparative oncology has the potential to enhance the cost-effectiveness and efficiency of pipelines for cancer drug discovery and other cancer treatments.},
  number={1},
  journal={MOLECULAR BIOLOGY AND EVOLUTION},
  author={Somarelli, Jason A. and Boddy, Amy M. and Gardner, Heather L. and DeWitt, Suzanne Bartholf and Tuohy, Joanne and Megquier, Kate and Sheth, Maya U. and Hsu, Shiaowen David and Thorne, Jeffrey L. and London, Cheryl A. and others},
  year={2020},
  month=jan,
  pages={11--17}
}

@article{larson_thorne_schmidler_2018, title={Modeling Dependence in Evolutionary Inference for Proteins}, volume={10812}, ISBN={["978-3-319-89928-2"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-319-89929-9_8}, abstractNote={Protein structure alignment is a classic problem of computational biology, and is widely used to identify structural and functional similarity and to infer homology among proteins.
Previously a statistical model for protein structural evolution has been introduced and shown to significantly improve phylogenetic inferences compared to approaches that utilize only amino acid sequence information. Here we extend this model to account for correlated evolutionary drift among neighboring amino acid positions, resulting in a spatio-temporal model of protein structure evolution. The result is a multivariate diffusion process convolved with a spatial birth-death process, which comes with little additional computational cost or analytical complexity compared to the site-independent model (SIM). We demonstrate that this extended, site-dependent model (SDM) yields a significant reduction of bias in estimated evolutionary distances and helps further improve phylogenetic tree reconstruction.}, journal={RESEARCH IN COMPUTATIONAL MOLECULAR BIOLOGY, RECOMB 2018}, author={Larson, Gary and Thorne, Jeffrey L. and Schmidler, Scott}, year={2018}, pages={122–137} } @article{ji_griffing_thorne_2016, title={A Phylogenetic Approach Finds Abundant Interlocus Gene Conversion in Yeast}, volume={33}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msw114}, abstractNote={Interlocus gene conversion (IGC) homogenizes repeats. While genomes can be repeat-rich, the evolutionary importance of IGC is poorly understood. Additional statistical tools for characterizing it are needed. We propose a composite likelihood strategy for incorporating IGC into widely-used probabilistic models for sequence changes that originate with point mutation. We estimated the percentage of nucleotide substitutions that originate with an IGC event rather than a point mutation in 14 groups of yeast ribosomal protein-coding genes, and found values ranging from 20% to 38%. We designed and applied a procedure to determine whether these percentages are inflated due to artifacts arising from model misspecification. 
The results of this procedure are consistent with IGC having had an important role in the evolution of each of these 14 gene families. We further investigate the properties of our IGC approach via simulation. In contrast to usual practice, our findings suggest that the IGC should and can be considered when multigene family evolution is investigated.}, number={9}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Ji, Xiang and Griffing, Alexander and Thorne, Jeffrey L.}, year={2016}, month={Sep}, pages={2469--2476} }

@article{lee_kishino_rodrigue_thorne_2016,
  title={Grouping substitution types into different relaxed molecular clocks},
  volume={371},
  number={1699},
  journal={Philosophical Transactions of the Royal Society of London. Series B, Biological Sciences},
  author={Lee, H. J. and Kishino, H. and Rodrigue, N. and Thorne, J. L.},
  year={2016}
}

@comment{Key of the next entry normalized: the exported key contained a space ("et al."), which is not valid inside a citation key. The truncated author list is expressed with "and others".}
@article{lassiter_russ_nusbaum_zeng_saville_olarte_carbone_hu_seguin-orlando_samaniego_et_al._2015,
  title={Mitochondrial genome sequences reveal evolutionary relationships of the {Phytophthora} 1c clade species},
  volume={61},
  ISSN={0172-8083 1432-0983},
  url={http://dx.doi.org/10.1007/s00294-015-0480-3},
  DOI={10.1007/s00294-015-0480-3},
  abstractNote={Phytophthora infestans is one of the most destructive plant pathogens of potato and tomato globally. The pathogen is closely related to four other Phytophthora species in the 1c clade including P. phaseoli, P. ipomoeae, P. mirabilis and P. andina that are important pathogens of other wild and domesticated hosts. P. andina is an interspecific hybrid between P. infestans and an unknown Phytophthora species. We have sequenced mitochondrial genomes of the sister species of P. infestans and examined the evolutionary relationships within the clade. Phylogenetic analysis indicates that the P. phaseoli mitochondrial lineage is basal within the clade. P. mirabilis and P. ipomoeae are sister lineages and share a common ancestor with the Ic mitochondrial lineage of P. andina. These lineages in turn are sister to the P. infestans and P. andina Ia mitochondrial lineages. The P. andina Ic lineage diverged much earlier than the P. andina Ia mitochondrial lineage and P. infestans. The presence of two mitochondrial lineages in P. andina supports the hybrid nature of this species. The ancestral state of the P. andina Ic lineage in the tree and its occurrence only in the Andean regions of Ecuador, Colombia and Peru suggests that the origin of this species hybrid in nature may occur there.},
  number={4},
  journal={Current Genetics},
  publisher={Springer Science and Business Media LLC},
  author={Lassiter, Erica S. and Russ, Carsten and Nusbaum, Chad and Zeng, Qiandong and Saville, Amanda C. and Olarte, Rodrigo A. and Carbone, Ignazio and Hu, Chia-Hui and Seguin-Orlando, Andaine and Samaniego, Jose A. and others},
  year={2015},
  month=mar,
  pages={567--577}
}

@article{lee_rodrigue_thorne_2015, title={Relaxing the Molecular Clock to Different Degrees for Different Substitution Types}, volume={32}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msv099}, abstractNote={Rates of molecular evolution can vary over time. Diverse statistical techniques for divergence time estimation have been developed to accommodate this variation. These typically require that all sequence (or codon) positions at a locus change independently of one another. They also generally assume that the rates of different types of nucleotide substitutions vary across a phylogeny in the same way. This permits divergence time estimation procedures to employ an instantaneous rate matrix with relative rates that do not differ among branches. However, previous studies have suggested that some substitution types (e.g., CpG to TpG changes in mammals) are more clock-like than others. As has been previously noted, this is biologically plausible given the mutational mechanism of CpG to TpG changes.
Through stochastic mapping of sequence histories from context-independent substitution models, our approach allows for context-dependent nucleotide substitutions to change their relative rates over time. We apply our approach to the analysis of a 0.15 Mb intergenic region from eight primates. In accord with previous findings, we find comparatively little rate variation over time for CpG to TpG substitutions but we find more for other substitution types. We conclude by discussing the limitations and prospects of our approach.}, number={8}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Lee, Hui-Jie and Rodrigue, Nicolas and Thorne, Jeffrey L.}, year={2015}, month={Aug}, pages={1948–1961} } @article{wang_yu_ji_lakner_griffing_thorne_2015, title={Roles of Solvent Accessibility and Gene Expression in Modeling Protein Sequence Evolution}, volume={11}, ISSN={["1176-9343"]}, DOI={10.4137/ebo.s22911}, abstractNote={Models of protein evolution tend to ignore functional constraints, although structural constraints are sometimes incorporated. Here we propose a probabilistic framework for codon substitution that evaluates joint effects of relative solvent accessibility (RSA), a structural constraint; and gene expression, a functional constraint. First, we explore the relationship between RSA and codon usage at the genomic scale as well as at the individual gene scale. Motivated by these results, we construct our framework by determining how probable is an amino acid, given RSA and gene expression, and then evaluating the relative probability of observing a codon compared to other synonymous codons. We come to the biologically plausible conclusion that both RSA and gene expression are related to amino acid frequencies, but, among synonymous codons, the relative probability of a particular codon is more closely related to gene expression than RSA. To illustrate the potential applications of our framework, we propose a new codon substitution model. 
Using this model, we obtain estimates of 2 N s, the product of effective population size N, and relative fitness difference of allele s. For a training data set consisting of human proteins with known structures and expression data, 2 N s is estimated separately for synonymous and nonsynonymous substitutions in each protein. We then contrast the patterns of synonymous and nonsynonymous 2 N s estimates across proteins while also taking gene expression levels of the proteins into account. We conclude that our 2 N s estimates are too concentrated around 0, and we discuss potential explanations for this lack of variability.}, journal={EVOLUTIONARY BIOINFORMATICS}, author={Wang, Kuangyu and Yu, Shuhui and Ji, Xiang and Lakner, Clemens and Griffing, Alexander and Thorne, Jeffrey L.}, year={2015} }

@comment{The next entry is a chapter in an edited book, so it is typed as incollection with the book title in booktitle. NOTE(review): the publisher was added during cleanup and should be confirmed against the book.}
@incollection{hobolth_thorne_2014,
  title={Sampling and summary statistics of endpoint-conditioned paths in {DNA} sequence evolution},
  booktitle={Bayesian phylogenetics: methods, algorithms, and applications},
  publisher={Chapman and Hall/CRC},
  author={Hobolth, A. and Thorne, J. L.},
  year={2014},
  pages={247--275}
}

@comment{Keys of the next two entries normalized: the exported keys contained spaces ("de koning", "et al."), which are not valid inside a citation key. Both entries describe the same paper; the second is retained under its own key. Truncated author lists are expressed with "and others".}
@article{liberles_teichmann_bahar_bastolla_bloom_bornberg-bauer_colwell_de_koning_dokholyan_echave_et_al._2012,
  title={The interface of protein structure, protein biophysics, and molecular evolution},
  volume={21},
  ISSN={0961-8368},
  url={http://dx.doi.org/10.1002/pro.2071},
  DOI={10.1002/pro.2071},
  abstractNote={The interface of protein structural biology, protein biophysics, molecular evolution, and molecular population genetics forms the foundations for a mechanistic understanding of many aspects of protein biochemistry. Current efforts in interdisciplinary protein modeling are in their infancy and the state‐of‐the art of such models is described. Beyond the relationship between amino acid substitution and static protein structure, protein function, and corresponding organismal fitness, other considerations are also discussed. More complex mutational processes such as insertion and deletion and domain rearrangements and even circular permutations should be evaluated. The role of intrinsically disordered proteins is still controversial, but may be increasingly important to consider. Protein geometry and protein dynamics as a deviation from static considerations of protein structure are also important. Protein expression level is known to be a major determinant of evolutionary rate and several considerations including selection at the mRNA level and the role of interaction specificity are discussed. Lastly, the relationship between modeling and needed high‐throughput experimental data as well as experimental examination of protein evolution using ancestral sequence resurrection and in vitro biochemistry are presented, towards an aim of ultimately generating better models for biological inference and prediction.},
  number={6},
  journal={Protein Science},
  publisher={Wiley},
  author={Liberles, David A. and Teichmann, Sarah A. and Bahar, Ivet and Bastolla, Ugo and Bloom, Jesse and Bornberg-Bauer, Erich and Colwell, Lucy J. and de Koning, A. P. Jason and Dokholyan, Nikolay V. and Echave, Julian and others},
  year={2012},
  month=apr,
  pages={769--785}
}

@misc{liberles_teichmann_bahar_bastolla_bloom_bornberg-bauer_colwell_koning_dokholyan_echave_et_al._2012,
  title={The interface of protein structure, protein biophysics, and molecular evolution},
  volume={21},
  number={6},
  journal={Protein Science},
  author={Liberles, D. A. and Teichmann, S. A. and Bahar, I. and Bastolla, U. and Bloom, J. and Bornberg-Bauer, E. and Colwell, L. J. and Koning, A. P. J. and Dokholyan, N. V. and Echave, J. and others},
  year={2012},
  pages={769--785}
}

@article{cartwright_lartillot_thorne_2011, title={History Can Matter: Non-Markovian Behavior of Ancestral Lineages}, volume={60}, ISSN={["1076-836X"]}, DOI={10.1093/sysbio/syr012}, abstractNote={Although most of the important evolutionary events in the history of biology can only be studied via interspecific comparisons, it is challenging to apply the rich body of population genetic theory to the study of interspecific genetic variation. Probabilistic modeling of the substitution process would ideally be derived from first principles of population genetics, allowing a quantitative connection to be made between the parameters describing mutation, selection, drift, and the patterns of interspecific variation. There has been progress in reconciling population genetics and interspecific evolution for the case where mutation rates are sufficiently low, but when mutation rates are higher, reconciliation has been hampered due to complications from how the loss or fixation of new mutations can be influenced by linked nonneutral polymorphisms (i.e., the Hill-Robertson effect). To investigate the generation of interspecific genetic variation when concurrent fitness-affecting polymorphisms are common and the Hill-Robertson effect is thereby potentially strong, we used the Wright-Fisher model of population genetics to simulate very many generations of mutation, natural selection, and genetic drift. This was done so that the chronological history of advantageous, deleterious, and neutral substitutions could be traced over time along the ancestral lineage. Our simulations show that the process by which a nonrecombining sequence changes over time can markedly deviate from the Markov assumption that is ubiquitous in molecular phylogenetics. In particular, we find tendencies for advantageous substitutions to be followed by deleterious ones and for deleterious substitutions to be followed by advantageous ones.
Such non-Markovian patterns reflect the fact that the fate of the ancestral lineage depends not only on its current allelic state but also on gene copies not belonging to the ancestral lineage. Although our simulations describe nonrecombining sequences, we conclude by discussing how non-Markovian behavior of the ancestral lineage is plausible even when recombination rates are not low. As a result, we believe that increased attention needs to be devoted to the robustness of evolutionary inference procedures that rely upon the Markov assumption.}, number={3}, journal={SYSTEMATIC BIOLOGY}, author={Cartwright, Reed A. and Lartillot, Nicolas and Thorne, Jeffrey L.}, year={2011}, month={May}, pages={276–290} } @article{yokoyama_thorne_wray_2011, title={Coordinated Genome-Wide Modifications within Proximal Promoter Cis-regulatory Elements during Vertebrate Evolution}, volume={3}, ISSN={["1759-6653"]}, DOI={10.1093/gbe/evq078}, abstractNote={There often exists a “one-to-many” relationship between a transcription factor and a multitude of binding sites throughout the genome. It is commonly assumed that transcription factor binding motifs remain largely static over the course of evolution because changes in binding specificity can alter the interactions with potentially hundreds of sites across the genome. Focusing on regulatory motifs overrepresented at specific locations within or near the promoter, we find that a surprisingly large number of cis-regulatory elements have been subject to coordinated genome-wide modifications during vertebrate evolution, such that the motif frequency changes on a single branch of vertebrate phylogeny. This was found to be the case even between closely related mammal species, with nearly a third of all location-specific consensus motifs exhibiting significant modifications within the human or mouse lineage since their divergence. 
Many of these modifications are likely to be compensatory changes throughout the genome following changes in protein factor binding affinities, whereas others may be due to changes in mutation rates or effective population size. The likelihood that this happened many times during vertebrate evolution highlights the need to examine additional taxa and to understand the evolutionary and molecular mechanisms underlying the evolution of protein–DNA interactions.}, journal={GENOME BIOLOGY AND EVOLUTION}, author={Yokoyama, Ken Daigoro and Thorne, Jeffrey L. and Wray, Gregory A.}, year={2011}, pages={66–74} } @article{choi_stone_kishino_thorne_2009, title={Estimates of natural selection due to protein tertiary structure inform the ancestry of biallelic loci}, volume={441}, DOI={10.1016/j.gene.2008.07.020}, abstractNote={We consider the inference of which of two alleles is ancestral when the alleles have a single nonsynonymous difference and when natural selection acts via protein tertiary structure. Whereas the probability that an allele is ancestral under neutrality is equal to its frequency, under selection this probability depends on allele frequency and on the magnitude and direction of selection pressure. Although allele frequencies can be well estimated from intraspecific data, small fitness differences have a large evolutionary impact but can be difficult to estimate with only intraspecific data. Methods for predicting aspects of phenotype from genotype can supplement intraspecific sequence data. Recently developed statistical techniques can assess effects of phenotypes, such as protein tertiary structure on molecular evolution. 
While these techniques were initially designed for comparing protein-coding genes from different species, the resulting interspecific inferences can be assigned population genetic interpretations to assess the effect of selection pressure, and we use them here along with intraspecific allele frequency data to estimate the probability that an allele is ancestral. We focus on 140 nonsynonymous single nucleotide polymorphisms of humans that are in proteins with known tertiary structures. We find that our technique for employing protein tertiary structure information yields some biologically plausible results but that it does not substantially improve the inference of ancestral human allele types.}, number={1-2}, journal={Gene}, author={Choi, S. C. and Stone, E. A. and Kishino, H. and Thorne, J. L.}, year={2009}, pages={45–52} } @article{choi_redelings_thorne_2008, title={Basing population genetic inferences and models of molecular evolution upon desired stationary distributions of DNA or protein sequences}, volume={363}, ISSN={["0962-8436"]}, DOI={10.1098/rstb.2008.0167}, abstractNote={Models of molecular evolution tend to be overly simplistic caricatures of biology that are prone to assigning high probabilities to biologically implausible DNA or protein sequences. Here, we explore how to construct time-reversible evolutionary models that yield stationary distributions of sequences that match given target distributions. By adopting comparatively realistic target distributions, evolutionary models can be improved. Instead of focusing on estimating parameters, we concentrate on the population genetic implications of these models. Specifically, we obtain estimates of the product of effective population size and relative fitness difference of alleles. The approach is illustrated with two applications to protein-coding DNA. 
In the first, a codon-based evolutionary model yields a stationary distribution of sequences, which, when the sequences are translated, matches a variable-length Markov model trained on human proteins. In the second, we introduce an insertion–deletion model that describes selectively neutral evolutionary changes to DNA. We then show how to modify the neutral model so that its stationary distribution at the amino acid level can match a profile hidden Markov model, such as the one associated with the Pfam database.}, number={1512}, journal={PHILOSOPHICAL TRANSACTIONS OF THE ROYAL SOCIETY B-BIOLOGICAL SCIENCES}, author={Choi, Sang Chul and Redelings, Benjamin D. and Thorne, Jeffrey L.}, year={2008}, month={Dec}, pages={3931–3939} } @article{xiang_thorne_seo_zhang_thomas_ricklefs_2008, title={Rates of nucleotide substitution in Cornaceae (Cornales)—Pattern of variation and underlying causal factors}, volume={49}, ISSN={10557903}, url={https://linkinghub.elsevier.com/retrieve/pii/S1055790308003606}, DOI={10.1016/j.ympev.2008.07.010}, abstractNote={Identifying causes of genetic divergence is a central goal in evolutionary biology. Although rates of nucleotide substitution vary among taxa and among genes, the causes of this variation tend to be poorly understood. In the present study, we examined the rate and pattern of molecular evolution for five DNA regions over a phylogeny of Cornus, the single genus of Cornaceae. To identify evolutionary mechanisms underlying the molecular variation, we employed Bayesian methods to estimate divergence times and to infer how absolute rates of synonymous and nonsynonymous substitutions and their ratios change over time. We found that the rates vary among genes, lineages, and through time, and differences in mutation rates, selection type and intensity, and possibly genetic drift all contributed to the variation of substitution rates observed among the major lineages of Cornus. 
We applied independent contrast analysis to explore whether speciation rates are linked to rates of molecular evolution. The results showed no relationships for individual genes, but suggested a possible localized link between species richness and rate of nonsynonymous nucleotide substitution for the combined cpDNA regions. Furthermore, we detected a positive correlation between rates of molecular evolution and morphological change in Cornus. This was particularly pronounced in the dwarf dogwood lineage, in which genome-wide acceleration in both molecular and morphological evolution has likely occurred.}, number={1}, journal={Molecular Phylogenetics and Evolution}, author={Xiang, Qiu-Yun (Jenny) and Thorne, Jeffrey L. and Seo, Tae-Kun and Zhang, Wenheng and Thomas, David T. and Ricklefs, Robert E.}, year={2008}, month={Oct}, pages={327–342} } @article{lin_fang_thorne_2007, title={A tabu search algorithm for maximum parsimony phylogeny inference}, volume={176}, ISSN={["1872-6860"]}, DOI={10.1016/j.ejor.2005.10.031}, abstractNote={Phylogeny reconstruction is too complex a combinatorial problem for an exhaustive search, because the number of possible solutions increases exponentially with the number of taxa involved. In this paper, we adopt the parsimony principle and design a tabu search algorithm for finding a most parsimonious phylogeny tree. A special array structure is employed to represent the topology of trees and to generate the neighboring trees. We test the proposed tabu search algorithm on randomly selected data sets obtained from nuclear ribosomal DNA sequence data. The experiments show that our algorithm explores fewer trees to reach the optimal one than the commonly used program “dnapenny” (branch-and-bound based) while it generates much more accurate results than the default options of the program “dnapars” (heuristic search based). 
The percentage of search space needed to find the best solution for our algorithm decreased rapidly as the number of taxa increased. For a 20-taxon phylogeny problem, it needs on average to examine only 3.92 × 10−15% of the sample space.}, number={3}, journal={EUROPEAN JOURNAL OF OPERATIONAL RESEARCH}, author={Lin, Yu-Min and Fang, Shu-Cherng and Thorne, Jeffrey L.}, year={2007}, month={Feb}, pages={1908–1917} } @article{thorne_choi_yu_higgs_kishino_2007, title={Population genetics without intraspecific data}, volume={24}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msm085}, abstractNote={A central goal of computational biology is the prediction of phenotype from DNA and protein sequence data. Recent models of sequence change use in silico prediction systems to incorporate the effects of phenotype on evolutionary rates. These models have been designed for analyzing sequence data from different species and have been accompanied by statistical techniques for estimating model parameters when the incorporation of phenotype induces dependent change among sequence positions. A difficulty with these efforts to link phenotype and interspecific evolution is that evolution occurs within populations, and parameters of interspecific models should have population genetic interpretations. We show, with two examples, how population genetic interpretations can be assigned to evolutionary models. The first example considers the impact of RNA secondary structure on sequence change, and the second reflects the tendency for protein tertiary structure to influence nonsynonymous substitution rates. We argue that statistical fit to data should not be the sole criterion for assessing models of sequence change. A good interspecific model should also yield a clear and biologically plausible population genetic interpretation.}, number={8}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Thorne, Jeffrey L. and Choi, Sang Chul and Yu, Jiaye and Higgs, Paul G. 
and Kishino, Hirohisa}, year={2007}, month={Aug}, pages={1667–1677} } @article{thorne_2007, title={Protein evolution constraints and model-based techniques to study them}, volume={17}, ISSN={["1879-033X"]}, DOI={10.1016/j.sbi.2007.05.006}, abstractNote={There have been substantial improvements in statistical tools for assessing the evolutionary roles of mutation and natural selection from interspecific sequence data. The importance of having the rate at which a point mutation occurs depend on the DNA sequence at sites surrounding the mutation is now better appreciated and can be accommodated in probabilistic models of protein evolution. To quantify the evolutionary impact of some aspect of phenotype, one promising strategy is to develop a system for predicting phenotype from the DNA sequence and to then infer how the evolutionary rates of sequence change are affected by the predicted phenotypic consequences of the changes. Although statistical tools for characterizing protein evolution are improving, the list of candidate phenomena that can affect rates of protein evolution is long and the relative contributions of these phenomena are only beginning to be disentangled.}, number={3}, journal={CURRENT OPINION IN STRUCTURAL BIOLOGY}, author={Thorne, Jeffrey L.}, year={2007}, month={Jun}, pages={337–341} } @article{choi_hobolth_robinson_kishino_thorne_2007, title={Quantifying the impact of protein tertiary structure on molecular evolution}, volume={24}, ISSN={["0737-4038"]}, DOI={10.1093/molbev/msm097}, abstractNote={To investigate the evolutionary impact of protein structure, the experimentally determined tertiary structure and the protein-coding DNA sequence were collected for each of 1,195 genes. These genes were studied via a model of sequence change that explicitly incorporates effects on evolutionary rates due to protein tertiary structure. 
In the model, these effects act via the solvent accessibility environments and pairwise amino acid interactions that are induced by tertiary structure. To compare the hypotheses that structure does and does not have a strong influence on evolution, Bayes factors were estimated for each of the 1,195 sequences. Most of the Bayes factors strongly support the hypothesis that protein structure affects protein evolution. Furthermore, both solvent accessibility and pairwise interactions among amino acids are inferred to have important roles in protein evolution. Our results also indicate that the strength of the relationship between tertiary structure and evolution has a weak but real correlation to the annotation information in the Gene Ontology database. Although their influences on rates of evolution vary among protein families, we find that the mean impacts of solvent accessibility and pairwise interactions are about the same.}, number={8}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Choi, Sang Chul and Hobolth, Asger and Robinson, Douglas M. and Kishino, Hirohisa and Thorne, Jeffrey L.}, year={2007}, month={Aug}, pages={1769–1782} } @article{yu_thorne_2006, title={Dependence among sites in RNA evolution}, volume={23}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msl015}, abstractNote={Although probabilistic models of genotype (e.g., DNA sequence) evolution have been greatly elaborated, less attention has been paid to the effect of phenotype on the evolution of the genotype. Here we propose an evolutionary model and a Bayesian inference procedure that are aimed at filling this gap. In the model, RNA secondary structure links genotype and phenotype by treating the approximate free energy of a sequence folded into a secondary structure as a surrogate for fitness. The underlying idea is that a nucleotide substitution resulting in a more stable secondary structure should have a higher rate than a substitution that yields a less stable secondary structure. 
This free energy approach incorporates evolutionary dependencies among sequence positions beyond those that are reflected simply by jointly modeling change at paired positions in an RNA helix. Although there is not a formal requirement with this approach that secondary structure be known and nearly invariant over evolutionary time, computational considerations make these assumptions attractive and they have been adopted in a software program that permits statistical analysis of multiple homologous sequences that are related via a known phylogenetic tree topology. Analyses of 5S ribosomal RNA sequences are presented to illustrate and quantify the strong impact that RNA secondary structure has on substitution rates. Analyses on simulated sequences show that the new inference procedure has reasonable statistical properties. Potential applications of this procedure, including improved ancestral sequence inference and location of functionally interesting sites, are discussed.}, number={8}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Yu, Jiaye and Thorne, Jeffrey L.}, year={2006}, month={Aug}, pages={1525–1537} } @article{yu_thorne_2006, title={Testing for spatial clustering of amino acid replacements within protein tertiary structure}, volume={62}, ISSN={["1432-1432"]}, DOI={10.1007/s00239-005-0107-2}, abstractNote={Widely used models of protein evolution ignore protein structure. Therefore, these models do not predict spatial clustering of amino acid replacements with respect to tertiary structure. One formal and biologically implausible possibility is that there is no tendency for amino acid replacements to be spatially clustered during evolution. An alternative to this is that amino acid replacements are spatially clustered and this spatial clustering can be fully explained by a tendency for similar rates of amino acid replacement at sites that are nearby in protein tertiary structure. 
A third possibility is that the amount of clustering exceeds that which can be explained solely on the basis of independently evolving protein sites with spatially clustered replacement rates. We introduce two simple and not very parametric hypothesis tests that help distinguish these three possibilities. We then apply these tests to 273 homologous protein families. The null hypothesis of no spatial clustering is rejected for 102 of 273 families. The explanation of spatially clustered rates but independent change among sites is rejected for 43 families. These findings need to be reconciled with the common practice of basing evolutionary inferences on models that assume independent change among sites.}, number={6}, journal={JOURNAL OF MOLECULAR EVOLUTION}, author={Yu, Jiaye and Thorne, Jeffrey L.}, year={2006}, month={Jun}, pages={682–692} } @article{seo_kishino_thorne_2005, title={Incorporating gene-specific variation when inferring and evaluating optimal evolutionary tree topologies from multilocus sequence data}, volume={102}, ISSN={["0027-8424"]}, DOI={10.1073/pnas.0408313102}, abstractNote={Because of the increase of genomic data, multiple genes are often available for the inference of phylogenetic relationships. The simple approach for combining multiple genes from the same taxon is to concatenate the sequences and then ignore the fact that different positions in the concatenated sequence came from different genes. Here, we discuss two criteria for inferring the optimal tree topology from data sets with multiple genes. These criteria are designed for multigene data sets where gene-specific evolutionary features are too important to ignore. One criterion is conventional and is obtained by taking the sum of log-likelihoods over all genes. The other criterion is obtained by dividing the log-likelihood for a gene by its sequence length and then taking the arithmetic mean over genes of these ratios. A similar strategy could be adopted with parsimony scores. 
The optimal tree is then declared to be the one for which the sum or the arithmetic mean is maximized. These criteria are justified within a two-stage hierarchical framework. The first level of the hierarchy represents gene-specific evolutionary features, and the second represents site-specific features for given genes. For testing significance of the optimal topology, we suggest a two-stage bootstrap procedure that involves resampling genes and then resampling alignment columns within resampled genes. An advantage of this procedure over concatenation is that it can effectively account for gene-specific evolutionary features. We discuss the applicability of the two-stage bootstrap idea to the Kishino–Hasegawa test and the Shimodaira–Hasegawa test.}, number={12}, journal={PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES OF THE UNITED STATES OF AMERICA}, author={Seo, TK and Kishino, H and Thorne, JL}, year={2005}, month={Mar}, pages={4436–4441} } @article{thorne_2004, title={A Bayesian approach to DNA sequence segmentation - Discussion}, volume={60}, ISSN={["0006-341X"]}, DOI={10.1111/j.0006-341X.2004.206_5.x}, abstractNote={.}, number={3}, journal={BIOMETRICS}, author={Thorne, JL}, year={2004}, month={Sep}, pages={584–585} } @article{seo_kishino_thorne_2004, title={Estimating absolute rates of synonymous and nonsynonymous nucleotide substitution in order to characterize natural selection and date species divergences}, volume={21}, ISSN={["1537-1719"]}, DOI={10.1093/molbev/msh088}, abstractNote={The rate of molecular evolution can vary among lineages. Sources of this variation have differential effects on synonymous and nonsynonymous substitution rates. Changes in effective population size or patterns of natural selection will mainly alter nonsynonymous substitution rates. Changes in generation length or mutation rates are likely to have an impact on both synonymous and nonsynonymous substitution rates. 
By comparing changes in synonymous and nonsynonymous rates, the relative contributions of the driving forces of evolution can be better characterized. Here, we introduce a procedure for estimating the chronological rates of synonymous and nonsynonymous substitutions on the branches of an evolutionary tree. Because the widely used ratio of nonsynonymous and synonymous rates is not designed to detect simultaneous increases or simultaneous decreases in synonymous and nonsynonymous rates, the estimation of these rates rather than their ratio can improve characterization of the evolutionary process. With our Bayesian approach, we analyze cytochrome oxidase subunit I evolution in primates and infer that nonsynonymous rates have a greater tendency to change over time than do synonymous rates. Our analysis of these data also suggests that rates have been positively correlated.}, number={7}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Seo, TK and Kishino, H and Thorne, JL}, year={2004}, month={Jul}, pages={1201–1213} } @misc{sanderson_thorne_wikstrom_bremer_2004, title={Molecular evidence on plant divergence times}, volume={91}, ISSN={["1537-2197"]}, DOI={10.3732/ajb.91.10.1656}, abstractNote={Estimation of divergence times from sequence data has become increasingly feasible in recent years. Conflicts between fossil evidence and molecular dates have sparked the development of new methods for inferring divergence times, further encouraging these efforts. In this paper, available methods for estimating divergence times are reviewed, especially those geared toward handling the widespread variation in rates of molecular evolution observed among lineages. The assumptions, strengths, and weaknesses of local clock, Bayesian, and rate smoothing methods are described. The rapidly growing literature applying these methods to key divergence times in plant evolutionary history is also reviewed. 
These include the crown group ages of green plants, land plants, seed plants, angiosperms, and major subclades of angiosperms. Finally, attempts to infer divergence times are described in the context of two very different temporal settings: recent adaptive radiations and much more ancient biogeographic patterns.}, number={10}, journal={AMERICAN JOURNAL OF BOTANY}, author={Sanderson, MJ and Thorne, JL and Wikstrom, N and Bremer, K}, year={2004}, month={Oct}, pages={1656–1665} } @article{scholl_thorne_mccarter_bird_2003, title={Horizontally transferred genes in plant-parasitic nematodes: a high-throughput genomic approach}, volume={4}, number={6}, journal={Genome Biology}, author={Scholl, E. H. and Thorne, J. L. and McCarter, J. P. and Bird, D. M.}, year={2003}, pages={R39–1} } @article{robinson_jones_kishino_goldman_thorne_2003, title={Protein evolution with dependence among codons due to tertiary structure}, volume={20}, ISSN={["0737-4038"]}, DOI={10.1093/molbev/msg184}, abstractNote={Markovian models of protein evolution that relax the assumption of independent change among codons are considered. With this comparatively realistic framework, an evolutionary rate at a site can depend both on the state of the site and on the states of surrounding sites. By allowing a relatively general dependence structure among sites, models of evolution can reflect attributes of tertiary structure. To quantify the impact of protein structure on protein evolution, we analyze protein-coding DNA sequence pairs with an evolutionary model that incorporates effects of solvent accessibility and pairwise interactions among amino acid residues. By explicitly considering the relationship between nonsynonymous substitution rates and protein structure, this approach can lead to refined detection and characterization of positive selection. Analyses of simulated sequence pairs indicate that parameters in this evolutionary model can be well estimated. 
Analyses of lysozyme c and annexin V sequence pairs yield the biologically reasonable result that amino acid replacement rates are higher when the replacements lead to energetically favorable proteins than when they destabilize the proteins. Although the focus here is evolutionary dependence among codons that is associated with protein structure, the statistical approach is quite general and could be applied to diverse cases of evolutionary dependence where surrogates for sequence fitness can be measured or modeled.}, number={10}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Robinson, DM and Jones, DT and Kishino, H and Goldman, N and Thorne, JL}, year={2003}, month={Oct}, pages={1692–1704} } @article{wiegmann_yeates_thorne_kishino_2003, title={Time flies, a new molecular time-scale for brachyceran fly evolution without a clock}, volume={52}, number={6}, journal={Systematic Biology}, author={Wiegmann, B. M. and Yeates, D. K. and Thorne, J. L. and Kishino, H.}, year={2003}, pages={745–756} } @misc{hasegawa_thorne_kishino_2003, title={Time scale of eutherian evolution estimated without assuming a constant rate of molecular evolution}, volume={78}, ISSN={["1880-5779"]}, DOI={10.1266/ggs.78.267}, abstractNote={Controversies over the molecular clock hypothesis were reviewed. Since it is evident that the molecular clock does not hold in an exact sense, accounting for evolution of the rate of molecular evolution is a prerequisite when estimating divergence times with molecular sequences. Recently proposed statistical methods that account for this rate variation are overviewed and one of these procedures is applied to the mitochondrial protein sequences and to the nuclear gene sequences from many mammalian species in order to estimate the time scale of eutherian evolution. This Bayesian method not only takes account of the variation of molecular evolutionary rate among lineages and among genes, but it also incorporates fossil evidence via constraints on node times. 
With denser taxonomic sampling and a more realistic model of molecular evolution, this Bayesian approach is expected to increase the accuracy of divergence time estimates.}, number={4}, journal={GENES & GENETIC SYSTEMS}, author={Hasegawa, M and Thorne, JL and Kishino, H}, year={2003}, month={Aug}, pages={267–283} } @article{seo_thorne_hasegawa_kishino_2002, title={A viral sampling design for testing the molecular clock and for estimating evolutionary rates and divergence times}, volume={18}, ISSN={["1367-4803"]}, DOI={10.1093/bioinformatics/18.1.115}, abstractNote={Abstract Motivation: The high pace of viral sequence change means that variation in the times at which sequences are sampled can have a profound effect both on the ability to detect trends over time in evolutionary rates and on the power to reject the Molecular Clock Hypothesis (MCH). Trends in viral evolutionary rates are of particular interest because their detection may allow connections to be established between a patient’s treatment or condition and the process of evolution. Variation in sequence isolation times also impacts the uncertainty associated with estimates of divergence times and evolutionary rates. Variation in isolation times can be intentionally adjusted to increase the power of hypothesis tests and to reduce the uncertainty of evolutionary parameter estimates, but this fact has received little previous attention. Results: We provide approximations for the power to reject the MCH when the alternative is that rates change in a linear fashion over time and when the alternative is that rates differ randomly among branches. In addition, we approximate the standard deviation of estimated evolutionary rates and divergence times. We illustrate how these approximations can be exploited to determine which viral sample to sequence when samples representing different dates are available. 
Contact: seo@ism.ac.jp; thorne@statgen.ncsu.edu; hasegawa@ism.ac.jp; kishino@wheat.ab.a.u-tokyo.ac.jp * To whom correspondence should be addressed.}, number={1}, journal={BIOINFORMATICS}, author={Seo, TK and Thorne, JL and Hasegawa, M and Kishino, H}, year={2002}, month={Jan}, pages={115–123} } @article{thorne_kishino_2002, title={Divergence time and evolutionary rate estimation with multilocus data}, volume={51}, ISSN={["1076-836X"]}, DOI={10.1080/10635150290102456}, abstractNote={Bayesian methods for estimating evolutionary divergence times are extended to multigene data sets, and a technique is described for detecting correlated changes in evolutionary rates among genes. Simulations are employed to explore the effect of multigene data on divergence time estimation, and the methodology is illustrated with a previously published data set representing diverse plant taxa. The fact that evolutionary rates and times are confounded when sequence data are compared is emphasized and the importance of fossil information for disentangling rates and times is stressed.}, number={5}, journal={SYSTEMATIC BIOLOGY}, author={Thorne, JL and Kishino, H}, year={2002}, pages={689–702} } @article{seo_thorne_hasegawa_kishino_2002, title={Estimation of effective population size of HIV-1 within a host: A pseudomaximum-likelihood approach}, volume={160}, number={4}, journal={Genetics}, author={Seo, T. K. and Thorne, J. L. and Hasegawa, M. and Kishino, H.}, year={2002}, pages={1283–1293} } @article{kishino_thorne_bruno_2001, title={Performance of a divergence time estimation method under a probabilistic model of rate evolution}, volume={18}, ISSN={["0737-4038"]}, DOI={10.1093/oxfordjournals.molbev.a003811}, abstractNote={Rates of molecular evolution vary over time and, hence, among lineages. In contrast, widely used methods for estimating divergence times from molecular sequence data assume constancy of rates. 
Therefore, methods for estimation of divergence times that incorporate rate variation are attractive. Improvements on a previously proposed Bayesian technique for divergence time estimation are described. New parameterization more effectively captures the phylogenetic structure of rate evolution on a tree. Fossil information and other evidence can now be included in Bayesian analyses in the form of constraints on divergence times. Simulation results demonstrate that the accuracy of divergence time estimation is substantially enhanced when constraints are included.}, number={3}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Kishino, H and Thorne, JL and Bruno, WJ}, year={2001}, month={Mar}, pages={352–361} } @article{thorne_2000, title={Models of protein sequence evolution and their applications}, volume={10}, ISSN={["1879-0380"]}, DOI={10.1016/S0959-437X(00)00142-8}, abstractNote={Homologous sequences are correlated due to their common ancestry. Probabilistic models of sequence evolution are employed routinely to properly account for these phylogenetic correlations. These increasingly realistic models provide a basis for studying evolution and for exploiting it to better understand protein structure and function. Notable recent advances have been made in the treatment of insertion and deletion events, the estimation of amino-acid replacement rates, and the detection of positive selection.}, number={6}, journal={CURRENT OPINION IN GENETICS & DEVELOPMENT}, author={Thorne, JL}, year={2000}, month={Dec}, pages={602–605} } @article{goldman_thorne_jones_1998, title={Assessing the impact of secondary structure and solvent accessibility on protein evolution}, volume={149}, number={1}, journal={Genetics}, author={Goldman, N. and Thorne, J. L. and Jones, D. 
T.}, year={1998}, pages={445–458} } @article{thorne_kishino_painter_1998, title={Estimating the rate of evolution of the rate of molecular evolution}, volume={15}, ISSN={["0737-4038"]}, DOI={10.1093/oxfordjournals.molbev.a025892}, abstractNote={A simple model for the evolution of the rate of molecular evolution is presented. With a Bayesian approach, this model can serve as the basis for estimating dates of important evolutionary events even in the absence of the assumption of constant rates among evolutionary lineages. The method can be used in conjunction with any of the widely used models for nucleotide substitution or amino acid replacement. It is illustrated by analyzing a data set of rbcL protein sequences.}, number={12}, journal={MOLECULAR BIOLOGY AND EVOLUTION}, author={Thorne, JL and Kishino, H and Painter, IS}, year={1998}, month={Dec}, pages={1647–1657} } @article{lio_goldman_thorne_jones_1998, title={PASSML: combining evolutionary inference and protein secondary structure prediction}, volume={14}, ISSN={["1367-4803"]}, DOI={10.1093/bioinformatics/14.8.726}, abstractNote={Abstract MOTIVATION: Evolutionary models of amino acid sequences can be adapted to incorporate structure information; protein structure biologists can use phylogenetic relationships among species to improve prediction accuracy. Results : A computer program called PASSML ('Phylogeny and Secondary Structure using Maximum Likelihood') has been developed to implement an evolutionary model that combines protein secondary structure and amino acid replacement. The model is related to that of Dayhoff and co-workers, but we distinguish eight categories of structural environment: alpha helix, beta sheet, turn and coil, each further classified according to solvent accessibility, i.e. buried or exposed. 
The model of sequence evolution for each of the eight categories is a Markov process with discrete states in continuous time, and the organization of structure along protein sequences is described by a hidden Markov model. This paper describes the PASSML software and illustrates how it allows both the reconstruction of phylogenies and prediction of secondary structure from aligned amino acid sequences. AVAILABILITY: PASSML 'ANSI C' source code and the example data sets described here are available at http://ng-dec1.gen.cam.ac.uk/hmm/Passml.html and 'downstream' Web pages. CONTACT: P.Lio@gen.cam.ac.uk}, number={8}, journal={BIOINFORMATICS}, author={Lio, P and Goldman, N and Thorne, JL and Jones, DT}, year={1998}, pages={726–733} }