@comment{
  Notes for maintainers
  - Encoding is UTF-8: abstracts and some titles contain non-ASCII characters.
  - abstractNote is a non-standard field that BibTeX and biblatex styles ignore.
    It holds raw text that is not LaTeX-escaped, so do not rename it to abstract
    without escaping percent signs, tildes and underscores first.
  - Entries without an author field: the original records carried neither author
    nor title. Each title was reconstructed from the original title-derived
    citation key, so its capitalisation should be confirmed against the DOI
    record, and the authors still need to be added from that record.
  - Citation keys that contained whitespace or punctuation were replaced by
    underscore-separated keys, because BibTeX cannot parse or cite a key that
    contains whitespace.
}

@article{cardenas-alvarez_restrepo-montoya_bergholz_2022,
  author       = {Cardenas-Alvarez, Maria X. and Restrepo-Montoya, Daniel and Bergholz, Teresa M.},
  title        = {Genome-Wide Association Study of {Listeria} monocytogenes Isolates Causing Three Different Clinical Outcomes},
  journal      = {Microorganisms},
  year         = {2022},
  month        = sep,
  doi          = {10.3390/microorganisms10101934},
  url          = {https://doi.org/10.3390/microorganisms10101934},
  abstractNote = {Heterogeneity in virulence potential of L. monocytogenes subgroups have been associated with genetic elements that could provide advantages in certain environments to invade, multiply, and survive within a host. The presence of gene mutations has been found to be related to attenuated phenotypes, while the presence of groups of genes, such as pathogenicity islands (PI), has been associated with hypervirulent or stress-resistant clones. We evaluated 232 whole genome sequences from invasive listeriosis cases in human and ruminants from the US and Europe to identify genomic elements associated with strains causing three clinical outcomes: central nervous system (CNS) infections, maternal-neonatal (MN) infections, and systemic infections (SI). Phylogenetic relationships and virulence-associated genes were evaluated, and a gene-based and single nucleotide polymorphism (SNP)-based genome-wide association study (GWAS) were conducted in order to identify loci associated with the different clinical outcomes. The orthologous results indicated that genes of phage phiX174, transfer RNAs, and type I restriction-modification (RM) system genes along with SNPs in loci involved in environmental adaptation such as rpoB and a phosphotransferase system (PTS) were associated with one or more clinical outcomes. Detection of phenotype-specific candidate loci represents an approach that could narrow the group of genetic elements to be evaluated in future studies.},
}

@article{restrepo-montoya_hulse-kemp_scheffler_haigler_hinze_love_percy_jones_frelichowski_2022,
  author       = {Restrepo-Montoya, Daniel and Hulse-Kemp, Amanda M. and Scheffler, Jodi A. and Haigler, Candace H. and Hinze, Lori L. and Love, Janna and Percy, Richard G. and Jones, Don C. and Frelichowski, James},
  title        = {Leveraging National Germplasm Collections to Determine Significantly Associated Categorical Traits in Crops: {Upland} and {Pima} Cotton as a Case Study},
  journal      = {Frontiers in Plant Science},
  volume       = {13},
  publisher    = {Frontiers Media SA},
  year         = {2022},
  month        = apr,
  issn         = {1664-462X},
  doi          = {10.3389/fpls.2022.837038},
  url          = {https://doi.org/10.3389/fpls.2022.837038},
  abstractNote = {Observable qualitative traits are relatively stable across environments and are commonly used to evaluate crop genetic diversity. Recently, molecular markers have largely superseded describing phenotypes in diversity surveys. However, qualitative descriptors are useful in cataloging germplasm collections and for describing new germplasm in patents, publications, and/or the Plant Variety Protection (PVP) system. This research focused on the comparative analysis of standardized cotton traits as represented within the National Cotton Germplasm Collection (NCGC). The cotton traits are named by ‘descriptors’ that have non-numerical sub-categories (descriptor states) reflecting the details of how each trait manifests or is absent in the plant. We statistically assessed selected accessions from three major groups of Gossypium as defined by the NCGC curator: (1) “Stoneville accessions (SA),” containing mainly Upland cotton (Gossypium hirsutum) cultivars; (2) “Texas accessions (TEX),” containing mainly G. hirsutum landraces; and (3) Gossypium barbadense (Gb), containing cultivars or landraces of Pima cotton (Gossypium barbadense). For 33 cotton descriptors we: (a) revealed distributions of character states for each descriptor within each group; (b) analyzed bivariate associations between paired descriptors; and (c) clustered accessions based on their descriptors. The fewest significant associations between descriptors occurred in the SA dataset, likely reflecting extensive breeding for cultivar development. In contrast, the TEX and Gb datasets showed a higher number of significant associations between descriptors, likely correlating with less impact from breeding efforts. Three significant bivariate associations were identified for all three groups, bract nectaries:boll nectaries, leaf hair:stem hair, and lint color:seed fuzz color. Unsupervised clustering analysis recapitulated the species labels for about 97% of the accessions. Unexpected clustering results indicated accessions that may benefit from potential further investigation. In the future, the significant associations between standardized descriptors can be used by curators to determine whether new exotic/unusual accessions most closely resemble Upland or Pima cotton. In addition, the study shows how existing descriptors for large germplasm datasets can be useful to inform downstream goals in breeding and research, such as identifying rare individuals with specific trait combinations and targeting breakdown of remaining trait associations through breeding, thus demonstrating the utility of the analytical methods employed in categorizing germplasm diversity within the collection.},
}

@article{restrepo-montoya_mcclean_osorno_2021,
  author       = {Restrepo-Montoya, Daniel and McClean, Phillip E. and Osorno, Juan M.},
  title        = {Orthology and synteny analysis of receptor-like kinases “{RLK}” and receptor-like proteins “{RLP}” in legumes},
  journal      = {BMC Genomics},
  year         = {2021},
  month        = feb,
  doi          = {10.1186/s12864-021-07384-w},
  url          = {https://doi.org/10.1186/s12864-021-07384-w},
  abstractNote = {Background: Legume species are an important plant model because of their protein-rich physiology. The adaptability and productivity of legumes are limited by major biotic and abiotic stresses. Responses to these stresses directly involve plasma membrane receptor proteins known as receptor-like kinases and receptor-like proteins. Evaluating the homology relations among RLK and RLP for seven legume species, and exploring their presence among synteny blocks allow an increased understanding of evolutionary relations, physical position, and chromosomal distribution in related species and their shared roles in stress responses. Results: Typically, a high proportion of RLK and RLP legume proteins belong to orthologous clusters, which is confirmed in this study, where between 66 to 90% of the RLKs and RLPs per legume species were classified in orthologous clusters. One-third of the evaluated syntenic blocks had shared RLK/RLP genes among both legumes and non-legumes. Among the legumes, between 75 and 98% of the RLK/RLP were present in syntenic blocks. The distribution of chromosomal segments between Phaseolus vulgaris and Vigna unguiculata, two species that diverged ~ 8 mya, were highly similar. Among the RLK/RLP synteny clusters, seven experimentally validated resistance RLK/RLP genes were identified in syntenic blocks. The RLK resistant genes FLS2, BIR2, ERECTA, IOS1, and AtSERK1 from Arabidopsis and SLSERK1 from Solanum lycopersicum were present in different pairwise syntenic blocks among the legume species. Meanwhile, only the LYM1- RLP resistant gene from Arabidopsis shared a syntenic blocks with Glycine max. Conclusions: The orthology analysis of the RLK and RLP suggests a dynamic evolution in the legume family, with between 66 to 85% of RLK and 83 to 88% of RLP belonging to orthologous clusters among the species evaluated. In fact, for the 10-species comparison, a lower number of singleton proteins were reported among RLP compared to RLK, suggesting that RLP positions are more physically conserved compared to RLK. The identification of RLK and RLP genes among the synteny blocks in legumes revealed multiple highly conserved syntenic blocks on multiple chromosomes. Additionally, the analysis suggests that P. vulgaris is an appropriate anchor species for comparative genomics among legumes.},
}

@article{restrepo-montoya_brueggeman_mcclean_osorno_2020,
  author       = {Restrepo-Montoya, Daniel and Brueggeman, Robert and McClean, Phillip E. and Osorno, Juan M.},
  title        = {Computational identification of receptor-like kinases “{RLK}” and receptor-like proteins “{RLP}” in legumes},
  journal      = {BMC Genomics},
  volume       = {21},
  number       = {1},
  publisher    = {Springer Science and Business Media LLC},
  year         = {2020},
  month        = jul,
  doi          = {10.1186/s12864-020-06844-z},
  url          = {https://doi.org/10.1186/s12864-020-06844-z},
  abstractNote = {Background: In plants, the plasma membrane is enclosed by the cell wall and anchors RLK and RLP proteins, which play a fundamental role in perception of developmental and environmental cues and are crucial in plant development and immunity. These plasma membrane receptors belong to large gene/protein families that are not easily classified computationally. This detailed analysis of these plasma membrane proteins brings a new source of information to the legume genetic, physiology and breeding research communities. Results: A computational approach to identify and classify RLK and RLP proteins is presented. The strategy was evaluated using experimentally-validated RLK and RLP proteins and was determined to have a sensitivity of over 0.85, a specificity of 1.00, and a Matthews correlation coefficient of 0.91. The computational approach can be used to develop a detailed catalog of plasma membrane receptors (by type and domains) in several legume/crop species. The exclusive domains identified in legumes for RLKs are WaaY, APH Pkinase_C, LRR_2, and EGF, and for RLP are L-lectin LPRY and PAN_4. The RLK-nonRD and RLCK subclasses are also discovered by the methodology. In both classes, less than 20% of the total RLK predicted for each species belong to this class. Among the 10-species evaluated ~ 40% of the proteins in the kinome are RLKs. The exclusive legume domain combinations identified are B-Lectin/PR5K domains in G. max, M. truncatula, V. angularis, and V. unguiculata and a three-domain combination B-lectin/S-locus/WAK in C. cajan, M. truncatula, P. vulgaris, V. angularis, and V. unguiculata. Conclusions: The analysis suggests that about 2% of the proteins of each genome belong to the RLK family and less than 1% belong to RLP family. Domain diversity combinations are greater for RLKs compared with the RLP proteins and LRR domains, and the dual domain combination LRR/Malectin were the most frequent domain for both groups of plasma membrane receptors among legume and non-legume species. Legumes exclusively show Pkinase extracellular domains, and atypical domain combinations in RLK and RLP compared with the non-legumes evaluated. The computational logic approach is statistically well supported and can be used with the proteomes of other plant species.},
}

@article{comparative_genomics_identifies_potential_virulence_factors_in_clostridium_tertium_and_c_paraputrificum_2019,
  title        = {Comparative Genomics Identifies Potential Virulence Factors in {Clostridium} tertium and {C.} paraputrificum},
  journal      = {Virulence},
  year         = {2019},
  month        = jul,
  doi          = {10.1080/21505594.2019.1637699},
  url          = {https://doi.org/10.1080/21505594.2019.1637699},
  abstractNote = {Some well-known Clostridiales species such as Clostridium difficile and C. perfringens are agents of high impact diseases worldwide. Nevertheless, other foreseen Clostridiales species have recently emerged such as Clostridium tertium and C. paraputrificum. Three fecal isolates were identified as Clostridium tertium (Gcol.A2 and Gcol.A43) and C. paraputrificum (Gcol.A11) during public health screening for C. difficile infections in Colombia. C. paraputrificum genomes were highly diverse and contained large numbers of accessory genes. Genetic diversity and accessory gene percentage were lower among the C. tertium genomes than in the C. paraputrificum genomes. C. difficile tcdA and tcdB toxins encoding homologous sequences and other potential virulence factors were also identified. EndoA interferase, a toxic component of the type II toxin-antitoxin system, was found among the C. tertium genomes. toxA was the only toxin encoding gene detected in Gcol.A43, the Colombian isolate with an experimentally-determined high cytotoxic effect. Gcol.A2 and Gcol.A43 had higher sporulation efficiencies than Gcol.A11 (84.5%, 83.8% and 57.0%, respectively), as supported by the greater number of proteins associated with sporulation pathways in the C. tertium genomes compared with the C. paraputrificum genomes (33.3 and 28.4 on average, respectively). This work allowed complete genome description of two clostridiales species revealing high levels of intra-taxa diversity, accessory genomes containing virulence-factors encoding genes (especially in C. paraputrificum), with proteins involved in sporulation processes more highly represented in C. tertium. These finding suggest the need to advance in the study of those species with potential importance at public health level.},
}

@article{integrated_genomic_epidemiology_and_phenotypic_profiling_of_clostridium_difficile_across_intra-hospital_and_community_populations_in_colombia_2019,
  title        = {Integrated Genomic Epidemiology and Phenotypic Profiling of {Clostridium} difficile across Intra-Hospital and Community Populations in {Colombia}},
  journal      = {Scientific Reports},
  year         = {2019},
  month        = aug,
  doi          = {10.1038/s41598-019-47688-2},
  url          = {https://doi.org/10.1038/s41598-019-47688-2},
  abstractNote = {Clostridium difficile, the causal agent of antibiotic-associated diarrhea, has a complex epidemiology poorly studied in Latin America. We performed a robust genomic and phenotypic profiling of 53 C. difficile clinical isolates established from diarrheal samples from either intrahospital (IH) or community (CO) populations in central Colombia. In vitro tests were conducted to evaluate the cytopathic effect, the minimum inhibitory concentration of ten antimicrobial agents, the sporulation efficiency and the colony forming ability. Eleven different sequence types (STs) were found, the majority present individually in each sample, however in three samples two different STs were isolated. Interestingly, CO patients were infected with STs associated with hypervirulent strains (ST-1 in Clade-2). Three coexistence events (two STs simultaneously detected in the same sample) were observed always involving ST-8 from Clade-1. A total of 2,502 genes were present in 99% of the isolates with 95% of identity or more, it represents a core genome of 28.6% of the 8,735 total genes identified in the set of genomes. A high cytopathic effect was observed for the isolates positive for the two main toxins but negative for binary toxin (TcdA+/TcdB+/CDT− toxin production type), found only in Clade-1. Molecular markers conferring resistance to fluoroquinolones (cdeA and gyrA) and to sulfonamides (folP) were the most frequent in the analyzed genomes. In addition, 15 other markers were found mostly in Clade-2 isolates. These results highlight the regional differences that C. difficile isolates display, being in this case the CO isolates the ones having a greater number of accessory genes and virulence-associated factors.},
}

@article{genetic_architecture_of_flooding_tolerance_in_the_dry_bean_middle-american_diversity_panel_2017,
  title        = {Genetic Architecture of Flooding Tolerance in the Dry Bean {Middle-American} Diversity Panel},
  journal      = {Frontiers in Plant Science},
  year         = {2017},
  month        = jul,
  doi          = {10.3389/fpls.2017.01183},
  url          = {https://doi.org/10.3389/fpls.2017.01183},
  abstractNote = {Flooding is a devastating abiotic stress that endangers crop production in the twenty-first century. Because of the severe susceptibility of common bean (Phaseolus vulgaris L.) to flooding, an understanding of the genetic architecture and physiological responses of this crop will set the stage for further improvement. However, challenging phenotyping methods hinder a large-scale genetic study of flooding tolerance in common bean and other economically important crops. A greenhouse phenotyping protocol was developed to evaluate the flooding conditions at early stages. The Middle-American diversity panel (n = 272) of common bean was developed to capture most of the diversity exits in North American germplasm. This panel was evaluated for seven traits under both flooded and non-flooded conditions at two early developmental stages. A subset of contrasting genotypes was further evaluated in the field to assess the relationship between greenhouse and field data under flooding condition. A genome-wide association study using ~150 K SNPs was performed to discover genomic regions associated with multiple physiological responses. The results indicate a significant strong correlation (r > 0.77) between greenhouse and field data, highlighting the reliability of greenhouse phenotyping method. Black and small red beans were the least affected by excess water at germination stage. At the seedling stage, pinto and great northern genotypes were the most tolerant. Root weight reduction due to flooding was greatest in pink and small red cultivars. Flooding reduced the chlorophyll content to the greatest extent in the navy bean cultivars compared with other market classes. Races of Durango/Jalisco and Mesoamerica were separated by both genotypic and phenotypic data indicating the potential effect of eco-geographical variations. Furthermore, several loci were identified that potentially represent the antagonistic pleiotropy. The GWAS analysis revealed peaks at Pv08/1.6 Mb and Pv02/41 Mb that are associated with root weight and germination rate, respectively. These regions are syntenic with two QTL reported in soybean (Glycine max L.) that contribute to flooding tolerance, suggesting a conserved evolutionary pathway involved in flooding tolerance for these related legumes.},
}

@article{changes_in_macrophage_gene_expression_associated_with_leishmania_viannia_braziliensis_infection_2015,
  title        = {Changes in Macrophage Gene Expression Associated with {Leishmania} ({Viannia}) braziliensis Infection},
  journal      = {PLOS ONE},
  year         = {2015},
  month        = jun,
  doi          = {10.1371/journal.pone.0128934},
  url          = {https://doi.org/10.1371/journal.pone.0128934},
  abstractNote = {Different Leishmania species cause distinct clinical manifestations of the infectious disease leishmaniasis. It is fundamentally important to understand the mechanisms governing the interaction between Leishmania and its host cell. Little is known about this interaction between Leishmania (Viannia) braziliensis and human macrophages. In this study, we aimed to identify differential gene expression between non-infected and L. (V) braziliensis-infected U937-derived macrophages. We deployed a whole human transcriptome microarray analysis using 72 hours post-infection samples and compared those samples with their non-infected counterparts. We found that 218 genes were differentially expressed between infected and non-infected macrophages. A total of 71.6% of these genes were down-regulated in the infected macrophages. Functional enrichment analyses identified the steroid and sterol/cholesterol biosynthetic processes between regulatory networks down-regulated in infected macrophages. RT-qPCR further confirmed this down-regulation in genes belonging to these pathways. These findings contrast with those from studies involving other Leishmania species at earlier infection stages, where gene up-regulation for this metabolic pathway has been reported. Sterol biosynthesis could be an important biological process associated with the expression profile of macrophages infected by L. (V.) braziliensis. Differential transcriptional results suggest a negative regulation of the genetic regulatory network involved in cholesterol biosynthesis.},
}

@inproceedings{a_multi-objective_optimization_energy_approach_to_predict_the_ligand_conformation_in_a_docking_process_2013,
  title        = {A Multi-Objective Optimization Energy Approach to Predict the Ligand Conformation in a Docking Process},
  booktitle    = {Lecture Notes in Computer Science},
  year         = {2013},
  doi          = {10.1007/978-3-642-37207-0_16},
  url          = {https://doi.org/10.1007/978-3-642-37207-0_16},
  abstractNote = {This work proposes a multi-objective algorithmic method for modelling the prediction of the conformation and configuration of ligands in receptor-ligand complexes by considering energy contributions of molecular interactions. The proposed approach is an improvement over others in the field, where the principle insight is that a Pareto front helps to understand the tradeoffs in the actual problem. The method is based on three main features: (i) Representation of molecular data using a trigonometric model; (ii) Modelling of molecular interactions with all-atoms force field energy functions and (iii) Exploration of the conformational space through a multi-objective evolutionary algorithm. The performance of the proposed model was evaluated and validated over a set of well known complexes. The method showed a promising performance when predicting ligands with high number of rotatable bonds.},
}

@article{the_autoimmune_tautology_an_in_silico_approach_2012,
  title        = {The Autoimmune Tautology: An In Silico Approach},
  journal      = {Autoimmune Diseases},
  year         = {2012},
  month        = mar,
  doi          = {10.1155/2012/792106},
  url          = {https://doi.org/10.1155/2012/792106},
  abstractNote = {There is genetic evidence of similarities and differences among autoimmune diseases (AIDs) that warrants looking at a general panorama of what has been published. Thus, our aim was to determine the main shared genes and to what extent they contribute to building clusters of AIDs. We combined a text-mining approach to build clusters of genetic concept profiles (GCPs) from the literature in MedLine with knowledge of protein-protein interactions to confirm if genes in GCP encode proteins that truly interact. We found three clusters in which the genes with the highest contribution encoded proteins that showed strong and specific interactions. After projecting the AIDs on a plane, two clusters could be discerned: Sjögren’s syndrome—systemic lupus erythematosus, and autoimmune thyroid disease—type1 diabetes—rheumatoid arthritis. Our results support the common origin of AIDs and the role of genes involved in apoptosis such as CTLA4, FASLG, and IL10.},
}

@article{identification_of_plasmodium_vivax_proteins_with_potential_role_in_invasion_using_sequence_redundancy_reduction_and_profile_hidden_markov_models_2011,
  title        = {Identification of {Plasmodium} vivax Proteins with Potential Role in Invasion Using Sequence Redundancy Reduction and Profile Hidden {Markov} Models},
  journal      = {PLoS ONE},
  year         = {2011},
  month        = oct,
  doi          = {10.1371/journal.pone.0025189},
  url          = {https://doi.org/10.1371/journal.pone.0025189},
  abstractNote = {Background: This study describes a bioinformatics approach designed to identify Plasmodium vivax proteins potentially involved in reticulocyte invasion. Specifically, different protein training sets were built and tuned based on different biological parameters, such as experimental evidence of secretion and/or involvement in invasion-related processes. A profile-based sequence method supported by hidden Markov models (HMMs) was then used to build classifiers to search for biologically-related proteins. The transcriptional profile of the P. vivax intra-erythrocyte developmental cycle was then screened using these classifiers. Results: A bioinformatics methodology for identifying potentially secreted P. vivax proteins was designed using sequence redundancy reduction and probabilistic profiles. This methodology led to identifying a set of 45 proteins that are potentially secreted during the P. vivax intra-erythrocyte development cycle and could be involved in cell invasion. Thirteen of the 45 proteins have already been described as vaccine candidates; there is experimental evidence of protein expression for 7 of the 32 remaining ones, while no previous studies of expression, function or immunology have been carried out for the additional 25. Conclusions: The results support the idea that probabilistic techniques like profile HMMs improve similarity searches. Also, different adjustments such as sequence redundancy reduction using Pisces or Cd-Hit allowed data clustering based on rational reproducible measurements. This kind of approach for selecting proteins with specific functions is highly important for supporting large-scale analyses that could aid in the identification of genes encoding potential new target antigens for vaccine development and drug design. The present study has led to targeting 32 proteins for further testing regarding their ability to induce protective immune responses against P. vivax malaria.},
}

@article{nclassg_a_classifier_for_non-classically_secreted_gram-positive_bacterial_proteins_2011,
  title        = {{NClassG+}: A Classifier for Non-Classically Secreted {Gram-Positive} Bacterial Proteins},
  journal      = {BMC Bioinformatics},
  year         = {2011},
  month        = jan,
  doi          = {10.1186/1471-2105-12-21},
  url          = {https://doi.org/10.1186/1471-2105-12-21},
  abstractNote = {Background: Most predictive methods currently available for the identification of protein secretion mechanisms have focused on classically secreted proteins. In fact, only two methods have been reported for predicting non-classically secreted proteins of Gram-positive bacteria. This study describes the implementation of a sequence-based classifier, denoted as NClassG+, for identifying non-classically secreted Gram-positive bacterial proteins. Results: Several feature-based classifiers were trained using different sequence transformation vectors (frequencies, dipeptides, physicochemical factors and PSSM) and Support Vector Machines (SVMs) with Linear, Polynomial and Gaussian kernel functions. Nested k-fold cross-validation (CV) was applied to select the best models, using the inner CV loop to tune the model parameters and the outer CV group to compute the error. The parameters and Kernel functions and the combinations between all possible feature vectors were optimized using grid search. Conclusions: The final model was tested against an independent set not previously seen by the model, obtaining better predictive performance compared to SecretomeP V2.0 and SecretP V2.0 for the identification of non-classically secreted proteins. NClassG+ is freely available on the web at http://www.biolisi.unal.edu.co/web-servers/nclassgpositive/},
}

@inproceedings{a_parallel_multi-objective_ab_initio_approach_for_protein_structure_prediction_2010,
  title        = {A Parallel Multi-Objective Ab Initio Approach for Protein Structure Prediction},
  booktitle    = {2010 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)},
  year         = {2010},
  month        = dec,
  doi          = {10.1109/bibm.2010.5706552},
  url          = {https://doi.org/10.1109/bibm.2010.5706552},
  abstractNote = {Protein structure prediction is one of the most important problems in bioinformatics and structural biology. This work proposes a novel and suitable methodology to model protein structure prediction with atomic-level detail by using a parallel multi-objective ab initio approach. In the proposed model, i) A trigonometric representation is used to compute backbone and side-chain torsion angles of protein atoms; ii) The Chemistry at HARvard Macromolecular Mechanics (CHARMm) function optimizes and evaluates the structures of the protein conformations; iii) The evolution of protein conformations is directed by optimization of protein energy contributions using the multi-objective genetic algorithm NSGA-II; and iv) The computation process is sped up and its effectiveness improved through the implementation of an island model of the evolutionary algorithm. The proposed model was validated on a set of benchmark proteins obtaining very promising results.},
}

@article{computational_prediction_and_experimental_assessment_of_secreted_surface_proteins_from_mycobacterium_tuberculosis_h37rv_2010,
  title        = {Computational Prediction and Experimental Assessment of Secreted/Surface Proteins from {Mycobacterium} tuberculosis {H37Rv}},
  journal      = {PLoS Computational Biology},
  year         = {2010},
  month        = jun,
  doi          = {10.1371/journal.pcbi.1000824},
  url          = {https://doi.org/10.1371/journal.pcbi.1000824},
  abstractNote = {The mycobacterial cell envelope has been implicated in the pathogenicity of tuberculosis and therefore has been a prime target for the identification and characterization of surface proteins with potential application in drug and vaccine development. In this study, the genome of Mycobacterium tuberculosis H37Rv was screened using Machine Learning tools that included feature-based predictors, general localizers and transmembrane topology predictors to identify proteins that are potentially secreted to the surface of M. tuberculosis, or to the extracellular milieu through different secretory pathways. The subcellular localization of a set of 8 hypothetically secreted/surface candidate proteins was experimentally assessed by cellular fractionation and immunoelectron microscopy (IEM) to determine the reliability of the computational methodology proposed here, using 4 secreted/surface proteins with experimental confirmation as positive controls and 2 cytoplasmic proteins as negative controls. Subcellular fractionation and IEM studies provided evidence that the candidate proteins Rv0403c, Rv3630, Rv1022, Rv0835, Rv0361 and Rv0178 are secreted either to the mycobacterial surface or to the extracellular milieu. Surface localization was also confirmed for the positive controls, whereas negative controls were located on the cytoplasm. Based on statistical learning methods, we obtained computational subcellular localization predictions that were experimentally assessed and allowed us to construct a computational protocol with experimental support that allowed us to identify a new set of secreted/surface proteins as potential vaccine candidates.},
}

@article{validating_subcellular_localization_prediction_tools_with_mycobacterial_proteins_2009,
  title        = {Validating Subcellular Localization Prediction Tools with Mycobacterial Proteins},
  journal      = {BMC Bioinformatics},
  year         = {2009},
  month        = may,
  doi          = {10.1186/1471-2105-10-134},
  url          = {https://doi.org/10.1186/1471-2105-10-134},
  abstractNote = {Background: The computational prediction of mycobacterial proteins' subcellular localization is of key importance for proteome annotation and for the identification of new drug targets and vaccine candidates. Several subcellular localization classifiers have been developed over the past few years, which have comprised both general localization and feature-based classifiers. Here, we have validated the ability of different bioinformatics approaches, through the use of SignalP 2.0, TatP 1.0, LipoP 1.0, Phobius, PA-SUB 2.5, PSORTb v.2.0.4 and Gpos-PLoc, to predict secreted bacterial proteins. These computational tools were compared in terms of sensitivity, specificity and Matthew's correlation coefficient (MCC) using a set of mycobacterial proteins having less than 40% identity, none of which are included in the training data sets of the validated tools and whose subcellular localization have been experimentally confirmed. These proteins belong to the TBpred training data set, a computational tool specifically designed to predict mycobacterial proteins. Results: A final validation set of 272 mycobacterial proteins was obtained from the initial set of 852 mycobacterial proteins. According to the results of the validation metrics, all tools presented specificity above 0.90, while dispersion sensitivity and MCC values were above 0.22. PA-SUB 2.5 presented the highest values; however, these results might be biased due to the methodology used by this tool. PSORTb v.2.0.4 left 56 proteins out of the classification, while Gpos-PLoc left just one protein out. Conclusion: Both subcellular localization approaches had high predictive specificity and high recognition of true negatives for the tested data set. Among those tools whose predictions are not based on homology searches against SWISS-PROT, Gpos-PLoc was the general localization tool with the best predictive performance, while SignalP 2.0 was the best tool among the ones using a feature-based approach. Even though PA-SUB 2.5 presented the highest metrics, it should be taken into account that this tool was trained using all proteins reported in SWISS-PROT, which includes the protein set tested in this study, either as a BLAST search or as a training model.},
}