% Atchley and Zhao (2007): journal article on the DNA-binding region of
% basic helix-loop-helix proteins. The non-standard abstractNote field is
% retained from the original export; standard styles ignore it.
@article{atchley_zhao_2007,
  author       = {Atchley, William R. and Zhao, Jieping},
  title        = {Molecular architecture of the {DNA}-binding region and its relationship to classification of basic helix-loop-helix proteins},
  journal      = {Molecular Biology and Evolution},
  year         = {2007},
  month        = jan,
  volume       = {24},
  number       = {1},
  pages        = {192--202},
  issn         = {1537-1719},
  doi          = {10.1093/molbev/msl143},
  abstractNote = {Multivariate statistical analyses are used to explore the molecular architecture of the DNA-binding and dimerization regions of basic helix-loop-helix (bHLH) proteins. Alphabetic amino acid data are transformed to biologically meaningful quantitative values using a set of 5 multivariate "indices." These multivariate indices summarize variation in a large suite of amino acid physiochemical attributes and reflect variability in polarity-accessibility-hydrophobicity, propensity for secondary structure, molecular size, codon composition, and electrostatic charge. Using these index score data, discriminant analyses describe the multidimensional aspects of physiochemical variation and clarify the structural basis of the prevailing evolutionary classification of bHLH proteins. A small number of amino acids from both the binding dimerization domains, when considered simultaneously, accurately distinguish the 5 known DNA-binding groups. The relevant sites often have well-documented structural and functional characteristics.},
}

% Atchley, Zhao, Fernandes and Druke (2005): journal article on numeric
% scores for amino acids. Author given names are stored as initials, as
% provided by the original export.
@article{atchley_zhao_fernandes_druke_2005,
  author       = {Atchley, W. R. and Zhao, J. P. and Fernandes, A. D. and Druke, T.},
  title        = {Solving the protein sequence metric problem},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  year         = {2005},
  month        = may,
  volume       = {102},
  number       = {18},
  pages        = {6395--6400},
  issn         = {1091-6490},
  doi          = {10.1073/pnas.0408677102},
  abstractNote = {Biological sequences are composed of long strings of alphabetic letters rather than arrays of numerical values. Lack of a natural underlying metric for comparing such alphabetic data significantly inhibits sophisticated statistical analyses of sequences, modeling structural and functional aspects of proteins, and related problems. Herein, we use multivariate statistical analyses on almost 500 amino acid attributes to produce a small set of highly interpretable numeric patterns of amino acid variability. These high-dimensional attribute data are summarized by five multidimensional patterns of attribute covariation that reflect polarity, secondary structure, molecular volume, codon diversity, and electrostatic charge. Numerical scores for each amino acid then transform amino acid sequences for statistical analyses. Relationships between transformed data and amino acid substitution matrices show significant associations for polarity and codon diversity scores. Transformed alphabetic data are used in analysis of variance and discriminant analysis to study DNA binding in the basic helix-loop-helix proteins. The transformed scores offer a general solution for analyzing a wide variety of sequence analysis problems.},
}