% Publication list. Layout: one field per line; page ranges use "--";
% months use the unquoted three-letter macros; truncated author lists end
% in "and others". Where a citation key was defined more than once, the
% first definition keeps its key and later ones carry a "b"/"c" suffix.

@article{wimberley_heber_2020,
  author       = {Wimberley, Charles E. and Heber, Steffen},
  title        = {{PeakPass}: Automating {ChIP-Seq} Blacklist Creation},
  journal      = {Journal of Computational Biology},
  volume       = {27},
  number       = {2},
  pages        = {259--268},
  year         = {2020},
  month        = feb,
  issn         = {1557-8666},
  doi          = {10.1089/cmb.2019.0295},
  abstractNote = {ChIP-Seq blacklists contain genomic regions that frequently produce artifacts and noise in ChIP-Seq experiments. To improve signal-to-noise ratio, ChIP-Seq pipelines often remove data points that map to blacklist regions. Existing blacklists have been compiled in a manual or semiautomated way. In this article we describe PeakPass, an efficient method to generate blacklists, and demonstrate that blacklists can increase ChIP-Seq data quality. PeakPass leverages machine learning and attempts to automate blacklist generation. PeakPass uses a random forest classifier in combination with genomic features such as sequence, annotated repeats, complexity, assembly gaps, and the ratio of multimapping to uniquely mapping reads to identify artifact regions. We have validated PeakPass on a large data set and tested it for the purpose of upgrading a blacklist to a new reference genome version. We trained PeakPass on the ENCODE blacklist for the hg19 human reference genome, and created an updated blacklist for hg38. To assess the performance of this blacklist, we tested 42 ChIP-Seq replicates from 24 experiments using 10 ChIP-Seq quality metrics including relative strand coefficient, standardized standard deviation, and enrichment of reads in promoter regions. Using the blacklist generated by PeakPass resulted in a statistically significant improvement for nine of these metrics.},
}

% Proceedings paper (has an ISBN and a proceedings title), so it is typed
% as inproceedings with the venue in booktitle rather than journal.
@inproceedings{wimberley_heber_2019,
  author       = {Wimberley, Charles E. and Heber, Steffen},
  title        = {{PeakPass}: Automating {ChIP-Seq} Blacklist Creation},
  booktitle    = {Bioinformatics Research and Applications, {ISBRA} 2019},
  volume       = {11490},
  pages        = {232--243},
  year         = {2019},
  isbn         = {978-3-030-20241-5},
  issn         = {1611-3349},
  doi          = {10.1007/978-3-030-20242-2_20},
  abstractNote = {ChIP-Seq blacklists contain genomic regions that frequently produce artifacts and noise in ChIP-Seq experiments. To improve signal-to-noise ratio, ChIP-Seq pipelines often remove data points that map to blacklist regions. Existing blacklists have been compiled in a manual or semi-automated way. In this paper we describe PeakPass, an efficient method to generate blacklists, and present evidence that blacklists can increase ChIP-Seq data quality. PeakPass leverages machine learning and attempts to automate blacklist generation. PeakPass uses a random forest classifier in combination with genomic features such as sequence, annotated repeats, complexity, assembly gaps, and the ratio of multi-mapping to uniquely mapping reads to identify artifact regions. We have validated PeakPass on a large dataset and tested it for the purpose of upgrading a blacklist to a new reference genome version. We trained PeakPass on the ENCODE blacklist for the hg19 human reference genome, and created an updated blacklist for hg38. To assess the performance of this blacklist we tested 42 ChIP-Seq replicates from 24 experiments using the Relative Strand Correlation (RSC) metric as a quality measure. Using the blacklist generated by PeakPass resulted in a statistically significant increase in RSC over the existing ENCODE blacklist for hg38 – average RSC was increased by 50% over the ENCODE blacklist, while only filtering an average of 0.1% of called peaks.},
}

@article{perkins_mazzoni-putman_stepanova_alonso_heber_2019,
  author       = {Perkins, Patrick and Mazzoni-Putman, Serina and Stepanova, Anna and Alonso, Jose and Heber, Steffen},
  title        = {{RiboStreamR}: a web application for quality control, analysis, and visualization of {Ribo-seq} data},
  journal      = {{BMC} Genomics},
  volume       = {20},
  year         = {2019},
  month        = jun,
  issn         = {1471-2164},
  doi          = {10.1186/s12864-019-5700-7},
  abstractNote = {Ribo-seq is a popular technique for studying translation and its regulation. A Ribo-seq experiment produces a snap-shot of the location and abundance of actively translating ribosomes within a cell's transcriptome. In practice, Ribo-seq data analysis can be sensitive to quality issues such as read length variation, low read periodicities, and contaminations with ribosomal and transfer RNA. Various software tools for data preprocessing, quality assessment, analysis, and visualization of Ribo-seq data have been developed. However, many of these tools require considerable practical knowledge of software applications, and often multiple different tools have to be used in combination with each other. We present riboStreamR, a comprehensive Ribo-seq quality control (QC) platform in the form of an R Shiny web application. RiboStreamR provides visualization and analysis tools for various Ribo-seq QC metrics, including read length distribution, read periodicity, and translational efficiency. Our platform is focused on providing a user-friendly experience, and includes various options for graphical customization, report generation, and anomaly detection within Ribo-seq datasets. RiboStreamR takes advantage of the vast resources provided by the R and Bioconductor environments, and utilizes the Shiny R package to ensure a high level of usability. Our goal is to develop a tool which facilitates in-depth quality assessment of Ribo-seq data by providing reference datasets and automatically highlighting quality issues and anomalies within datasets.},
}

% Key previously contained "et al." (with a space), which is not a valid
% citation key; it is now spelled "etal".
@article{tokarz_heffelfinger_jima_gerlach_shah_rodriguez-nunez_kortum_fletcher_nordone_law_etal_2017,
  author       = {Tokarz, Debra A. and Heffelfinger, Amy K. and Jima, Dereje D. and Gerlach, Jamie and Shah, Radhika N. and Rodriguez-Nunez, Ivan and Kortum, Amanda N. and Fletcher, Ashley A. and Nordone, Shila K. and Law, J. McHugh and others},
  title        = {Disruption of {Trim9} function abrogates macrophage motility in vivo},
  journal      = {Journal of Leukocyte Biology},
  volume       = {102},
  number       = {6},
  pages        = {1371--1380},
  year         = {2017},
  month        = oct,
  publisher    = {Wiley},
  issn         = {0741-5400 1938-3673},
  doi          = {10.1189/jlb.1a0816-371r},
  url          = {http://dx.doi.org/10.1189/jlb.1A0816-371R},
  abstractNote = {Abstract The vertebrate immune response comprises multiple molecular and cellular components that interface to provide defense against pathogens. Because of the dynamic complexity of the immune system and its interdependent innate and adaptive functionality, an understanding of the whole-organism response to pathogen exposure remains unresolved. Zebrafish larvae provide a unique model for overcoming this obstacle, because larvae are protected against pathogens while lacking a functional adaptive immune system during the first few weeks of life. Zebrafish larvae were exposed to immune agonists for various lengths of time, and a microarray transcriptome analysis was executed. This strategy identified known immune response genes, as well as genes with unknown immune function, including the E3 ubiquitin ligase tripartite motif-9 (Trim9). Although trim9 expression was originally described as “brain specific,” its expression has been reported in stimulated human Mϕs. In this study, we found elevated levels of trim9 transcripts in vivo in zebrafish Mϕs after immune stimulation. Trim9 has been implicated in axonal migration, and we therefore investigated the impact of Trim9 disruption on Mϕ motility and found that Mϕ chemotaxis and cellular architecture are subsequently impaired in vivo. These results demonstrate that Trim9 mediates cellular movement and migration in Mϕs as well as neurons.},
}

@inproceedings{perkins_heber_2017,
  author       = {Perkins, P. and Heber, Steffen},
  title        = {{riboStreamR}: A web application for quality control, analysis, and visualization of {Ribo-seq} data},
  booktitle    = {International conference on computational advances in bio and medical},
  year         = {2017},
  doi          = {10.1109/iccabs.2017.8114317},
  abstractNote = {Ribo-seq is a popular technique for studying translation and its regulation. Various software tools for data preprocessing, quality assessment, analysis, and visualization of Ribo-seq data have been developed. However, many of them are inaccessible to users without a thorough practical knowledge of software applications, and often multiple different tools have to be used in combination with each other. Here, we present riboStreamR, a comprehensive Ribo-seq quality control (QC) platform in the form of an R Shiny web application. RiboStreamR provides visualization and analysis tools for various Ribo-seq QC metrics, including read length distribution, read periodicity, and translational efficiency. The platform's environment is centered on providing a user-friendly experience, and includes numerous options for graphical customization and report generation. In practice, Ribo-seq data analysis can be sensitive to data quality issues such as read length variation, low read periodicities, and contaminations with ribosomal and transfer RNA. What constitutes ‘high quality’ data is often unclear. Our goal is to develop novel functionality to automatically highlight quality issues and anomalies in the data. This NSF-supported project is performed in collaboration with Jose Alonso, Anna Stepanova, Serina Mazzoni-Putman, and Cranos Williams.},
}

@article{hu_merchante_stepanova_alonso_heber_2016,
  author       = {Hu, Q. W. and Merchante, C. and Stepanova, A. N. and Alonso, J. M. and Heber, S.},
  title        = {Genome-wide search for translated upstream open reading frames in {Arabidopsis} thaliana},
  journal      = {{IEEE} Transactions on Nanobioscience},
  volume       = {15},
  number       = {2},
  pages        = {150--159},
  year         = {2016},
}

% Key previously contained "et al." (with a space), which is not a valid
% citation key; it is now spelled "etal".
@article{villarino_hu_manrique_flores-vergara_sehra_robles_brumos_stepanova_colombo_sundberg_etal_2016,
  author       = {Villarino, Gonzalo H. and Hu, Qiwen and Manrique, Silvia and Flores-Vergara, Miguel and Sehra, Bhupinder and Robles, Linda and Brumos, Javier and Stepanova, Anna N. and Colombo, Lucia and Sundberg, Eva and others},
  title        = {Transcriptomic Signature of the {SHATTERPROOF2} Expression Domain Reveals the Meristematic Nature of {Arabidopsis} Gynoecial Medial Domain},
  journal      = {Plant Physiology},
  volume       = {171},
  number       = {1},
  pages        = {42--61},
  year         = {2016},
  month        = may,
  issn         = {1532-2548},
  doi          = {10.1104/pp.15.01845},
  url          = {http://europepmc.org/abstract/med/26983993},
  abstractNote = {Transcriptional profiles of spatially and temporally restricted cell populations from the Arabidopsis gynoecium reveals the meristematic nature of the gynoecial medial domain. Plant meristems, like animal stem cell niches, maintain a pool of multipotent, undifferentiated cells that divide and differentiate to give rise to organs. In Arabidopsis (Arabidopsis thaliana), the carpel margin meristem is a vital meristematic structure that generates ovules from the medial domain of the gynoecium, the female floral reproductive structure. The molecular mechanisms that specify this meristematic region and regulate its organogenic potential are poorly understood. Here, we present a novel approach to analyze the transcriptional signature of the medial domain of the Arabidopsis gynoecium, highlighting the developmental stages that immediately proceed ovule initiation, the earliest stages of seed development. Using a floral synchronization system and a SHATTERPROOF2 (SHP2) domain-specific reporter, paired with FACS and RNA sequencing, we assayed the transcriptome of the gynoecial medial domain with temporal and spatial precision. This analysis reveals a set of genes that are differentially expressed within the SHP2 expression domain, including genes that have been shown previously to function during the development of medial domain-derived structures, including the ovules, thus validating our approach. Global analyses of the transcriptomic data set indicate a similarity of the pSHP2-expressing cell population to previously characterized meristematic domains, further supporting the meristematic nature of this gynoecial tissue. Our method identifies additional genes including novel isoforms, cis-natural antisense transcripts, and a previously unrecognized member of the REPRODUCTIVE MERISTEM family of transcriptional regulators that are potential novel regulators of medial domain development. This data set provides genome-wide transcriptional insight into the development of the carpel margin meristem in Arabidopsis.},
}

% First of three records that shared this key; it keeps the unsuffixed key.
% Typed as incollection because it carries booktitle and publisher.
@incollection{hu_merchante_stepanova_alonso_heber_2015,
  author       = {Hu, Qiwen and Merchante, Catharina and Stepanova, Anna N. and Alonso, Jose M. and Heber, Steffen},
  title        = {A Stacking-Based Approach to Identify Translated Upstream Open Reading Frames in {Arabidopsis} thaliana},
  booktitle    = {Bioinformatics Research and Applications},
  volume       = {9096},
  pages        = {138--149},
  year         = {2015},
  publisher    = {Springer International Publishing},
  isbn         = {9783319190471 9783319190488},
  issn         = {0302-9743 1611-3349},
  doi          = {10.1007/978-3-319-19048-8_12},
  url          = {http://dx.doi.org/10.1007/978-3-319-19048-8_12},
  abstractNote = {Upstream open reading frames (uORFs) are open reading frames located within the 5’ UTR of an mRNA. It is believed that translated uORFs reduce the translational efficiency of the main coding region, and play an important role in gene regulation. However, only few uORFs are experimentally characterized. In this paper, we use ribosome footprinting together with a stacking-based classification approach to identify translated uORFs in Arabidopsis thaliana. Our approach resulted in a set of 5360 potentially translated uORFs in 2051 genes. GO terms enriched in uORF-containing genes include gene regulation, signal transduction and metabolic pathway. The identified uORFs occur with a higher frequency in multi-isoform genes, and many uORFs are affected by alternative transcript start sites or alternative splicing events.},
}

% Second record under the formerly shared key (same title, volume 9096 and
% pages 138-149 as the record above); suffixed "b" to make the key unique.
@inproceedings{hu_merchante_stepanova_alonso_heber_2015b,
  author       = {Hu, Q. W. and Merchante, C. and Stepanova, A. N. and Alonso, J. M. and Heber, S.},
  title        = {A stacking-based approach to identify translated upstream open reading frames in {Arabidopsis} thaliana},
  booktitle    = {Bioinformatics Research and Applications ({ISBRA} 2015)},
  volume       = {9096},
  pages        = {138--149},
  year         = {2015},
}

@article{merchante_brumos_yun_hu_spencer_enriquez_binder_heber_stepanova_alonso_2015,
  author       = {Merchante, Catharina and Brumos, Javier and Yun, Jeonga and Hu, Qiwen and Spencer, Kristina R. and Enriquez, Paul and Binder, Brad M. and Heber, Steffen and Stepanova, Anna N. and Alonso, Jose M.},
  title        = {Gene-Specific Translation Regulation Mediated by the Hormone-Signaling Molecule {EIN2}},
  journal      = {Cell},
  volume       = {163},
  number       = {3},
  pages        = {684--697},
  year         = {2015},
  month        = oct,
  issn         = {1097-4172},
  doi          = {10.1016/j.cell.2015.09.036},
  url          = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84948814371&partnerID=MN8TOARS},
  abstractNote = {The central role of translation in modulating gene activity has long been recognized, yet the systematic exploration of quantitative changes in translation at a genome-wide scale in response to a specific stimulus has only recently become technically feasible. Using the well-characterized signaling pathway of the phytohormone ethylene and plant-optimized genome-wide ribosome footprinting, we have uncovered a molecular mechanism linking this hormone’s perception to the activation of a gene-specific translational control mechanism. Characterization of one of the targets of this translation regulatory machinery, the ethylene signaling component EBF2, indicates that the signaling molecule EIN2 and the nonsense-mediated decay proteins UPFs play a central role in this ethylene-induced translational response. Furthermore, the 3′UTR of EBF2 is sufficient to confer translational regulation and required for the proper activation of ethylene responses. These findings represent a mechanistic paradigm of gene-specific regulation of translation in response to a key growth regulator.},
}

% Third record under the formerly shared key (a different paper);
% suffixed "c" to make the key unique.
@inproceedings{hu_merchante_stepanova_alonso_heber_2015c,
  author       = {Hu, Q. W. and Merchante, C. and Stepanova, Anna and Alonso, Jose and Heber, Steffen},
  title        = {Mining transcript features related to translation in {Arabidopsis} using {LASSO} and random forest},
  booktitle    = {International conference on computational advances in bio and medical},
  year         = {2015},
  doi          = {10.1109/iccabs.2015.7344713},
  url          = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84960900444&partnerID=MN8TOARS},
  abstractNote = {Translation is an important process for all living organisms. During translation, messenger RNA is rewritten into protein. Multiple control mechanisms determine how much protein is generated during translation. In particular, several regulatory elements located on mRNA transcripts are known to affect translation. In this study, a genome-wide analysis was performed to mine features related to translation in the genome of Arabidopsis thaliana. We used ribosome footprinting data to measure translation and constructed a predictive model using LASSO and random forest to select features that likely affect translation. We identified multiple transcript features and measured their influence on translation in different transcript regions. We found that features related to different translation stages may have a different impact on translation; often, features relevant to the elongation step were playing a stronger role. Interestingly, we found that the contribution of features may be different for transcripts belonging to different functional groups, suggesting that transcripts might employ different mechanisms for the regulation of translation.},
}

@article{schreiner_nguyen_russo_heber_patrignani_ahrne_scheiffele_2014,
  author       = {Schreiner, Dietmar and Nguyen, Thi-Minh and Russo, Giancarlo and Heber, Steffen and Patrignani, Andrea and Ahrne, Erik and Scheiffele, Peter},
  title        = {Targeted Combinatorial Alternative Splicing Generates Brain Region-Specific Repertoires of Neurexins},
  journal      = {Neuron},
  volume       = {84},
  number       = {2},
  pages        = {386--398},
  year         = {2014},
  month        = oct,
  issn         = {1097-4199},
  doi          = {10.1016/j.neuron.2014.09.011},
  abstractNote = {Molecular diversity of surface receptors has been hypothesized to provide a mechanism for selective synaptic connectivity. Neurexins are highly diversified receptors that drive the morphological and functional differentiation of synapses. Using a single cDNA sequencing approach, we detected 1,364 unique neurexin-α and 37 neurexin-β mRNAs produced by alternative splicing of neurexin pre-mRNAs. This molecular diversity results from near-exhaustive combinatorial use of alternative splice insertions in Nrxn1α and Nrxn2α. By contrast, Nrxn3α exhibits several highly stereotyped exon selections that incorporate novel elements for posttranscriptional regulation of a subset of transcripts. Complexity of Nrxn1α repertoires correlates with the cellular complexity of neuronal tissues, and a specific subset of isoforms is enriched in a purified cell type. Our analysis defines the molecular diversity of a critical synaptic receptor and provides evidence that neurexin diversity is linked to cellular diversity in the nervous system.},
}

% Key previously contained "et al." (with a space), which is not a valid
% citation key; it is now spelled "etal".
@article{howard_hu_babaoglu_chandra_borghi_tan_he_winter-sederoff_gassmann_veronese_etal_2013,
  author       = {Howard, Brian E. and Hu, Qiwen and Babaoglu, Ahmet Can and Chandra, Manan and Borghi, Monica and Tan, Xiaoping and He, Luyan and Winter-Sederoff, Heike and Gassmann, Walter and Veronese, Paola and others},
  title        = {High-Throughput {RNA} Sequencing of {Pseudomonas}-Infected {Arabidopsis} Reveals Hidden Transcriptome Complexity and Novel Splice Variants},
  journal      = {{PLOS ONE}},
  volume       = {8},
  number       = {10},
  year         = {2013},
  month        = oct,
  issn         = {1932-6203},
  doi          = {10.1371/journal.pone.0074183},
  url          = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84885077606&partnerID=MN8TOARS},
  abstractNote = {We report the results of a genome-wide analysis of transcription in Arabidopsis thaliana after treatment with Pseudomonas syringae pathovar tomato. Our time course RNA-Seq experiment uses over 500 million read pairs to provide a detailed characterization of the response to infection in both susceptible and resistant hosts. The set of observed differentially expressed genes is consistent with previous studies, confirming and extending existing findings about genes likely to play an important role in the defense response to Pseudomonas syringae. The high coverage of the Arabidopsis transcriptome resulted in the discovery of a surprisingly large number of alternative splicing (AS) events – more than 44% of multi-exon genes showed evidence for novel AS in at least one of the probed conditions. This demonstrates that the Arabidopsis transcriptome annotation is still highly incomplete, and that AS events are more abundant than expected. To further refine our predictions, we identified genes with statistically significant changes in the ratios of alternative isoforms between treatments. This set includes several genes previously known to be alternatively spliced or expressed during the defense response, and it may serve as a pool of candidate genes for regulated alternative splicing with possible biological relevance for the defense response against invasive pathogens.},
}

@article{chang_georgianna_heber_payne_muddiman_2010,
  author       = {Chang, Kung-Yen and Georgianna, D. Ryan and Heber, Steffen and Payne, Gary A. and Muddiman, David C.},
  title        = {Detection of Alternative Splice Variants at the Proteome Level in {Aspergillus} flavus},
  journal      = {Journal of Proteome Research},
  volume       = {9},
  number       = {3},
  pages        = {1209--1217},
  year         = {2010},
  month        = mar,
  publisher    = {American Chemical Society (ACS)},
  issn         = {1535-3893 1535-3907},
  doi          = {10.1021/pr900602d},
  url          = {http://dx.doi.org/10.1021/pr900602d},
  abstractNote = {Identification of proteins from proteolytic peptides or intact proteins plays an essential role in proteomics. Researchers use search engines to match the acquired peptide sequences to the target proteins. However, search engines depend on protein databases to provide candidates for consideration. Alternative splicing (AS), the mechanism where the exon of pre-mRNAs can be spliced and rearranged to generate distinct mRNA and therefore protein variants, enable higher eukaryotic organisms, with only a limited number of genes, to have the requisite complexity and diversity at the proteome level. Multiple alternative isoforms from one gene often share common segments of sequences. However, many protein databases only include a limited number of isoforms to keep minimal redundancy. As a result, the database search might not identify a target protein even with high quality tandem MS data and accurate intact precursor ion mass. We computationally predicted an exhaustive list of putative isoforms of Aspergillus flavus proteins from 20 371 expressed sequence tags to investigate whether an alternative splicing protein database can assign a greater proportion of mass spectrometry data. The newly constructed AS database provided 9807 new alternatively spliced variants in addition to 12 832 previously annotated proteins. The searches of the existing tandem MS spectra data set using the AS database identified 29 new proteins encoded by 26 genes. Nine fungal genes appeared to have multiple protein isoforms. In addition to the discovery of splice variants, AS database also showed potential to improve genome annotation. In summary, the introduction of an alternative splicing database helps identify more proteins and unveils more information about a proteome.},
}

@article{howard_heber_2010,
  author       = {Howard, Brian E. and Heber, Steffen},
  title        = {Towards reliable isoform quantification using {RNA-SEQ} data},
  journal      = {{BMC} Bioinformatics},
  volume       = {11},
  year         = {2010},
  issn         = {1471-2105},
  doi          = {10.1186/1471-2105-11-s3-s6},
  abstractNote = {In eukaryotes, alternative splicing often generates multiple splice variants from a single gene. Here we explore the use of RNA sequencing (RNA-Seq) datasets to address the isoform quantification problem. Given a set of known splice variants, the goal is to estimate the relative abundance of the individual variants. Our method employs a linear models framework to estimate the ratios of known isoforms in a sample. A key feature of our method is that it takes into account the non-uniformity of RNA-Seq read positions along the targeted transcripts. Preliminary tests indicate that the model performs well on both simulated and real data. In two publicly available RNA-Seq datasets, we identified several alternatively-spliced genes with switch-like, on/off expression properties, as well as a number of other genes that varied more subtly in isoform expression. In many cases, genes exhibiting differential expression of alternatively spliced transcripts were not differentially expressed at the gene level. Given that changes in isoform expression level frequently involve a continuum of isoform ratios, rather than all-or-nothing expression, and that they are often independent of general gene expression changes, we anticipate that our research will contribute to revealing a so far uninvestigated layer of the transcriptome. We believe that, in the future, researchers will prioritize genes for functional analysis based not only on observed changes in gene expression levels, but also on changes in alternative splicing.},
}

@incollection{zhao_kim_heber_2009,
  author       = {Zhao, Sihui and Kim, Jihye and Heber, Steffen},
  title        = {Analysis of Cis-Regulatory Motifs in Cassette Exons by Incorporating Exon Skipping Rates},
  booktitle    = {Bioinformatics Research and Applications},
  pages        = {272--283},
  year         = {2009},
  publisher    = {Springer Berlin Heidelberg},
  isbn         = {9783642015502 9783642015519},
  issn         = {0302-9743 1611-3349},
  doi          = {10.1007/978-3-642-01551-9_27},
  url          = {http://dx.doi.org/10.1007/978-3-642-01551-9_27},
  abstractNote = {Identification of cis-regulatory motifs has long been a hotspot in the study of alternative splicing. We propose a two-step approach: we first identify k-mer seed motifs by testing for enrichment and significant differences in exon skipping rate, then a local stochastic search is applied to refine the seed motifs. Our approach is especially suitable to discover short and degenerate motifs. We applied our method to a dataset of CNS-specific cassette exons in mouse and discovered 15 motifs. Two of these motifs are highly similar to validated motifs, Nova and hnRNP A1 binding sites. Four motifs show positional bias relative to the splice sites. Our study provides a dictionary of sequence motifs involved in the regulation of alternative splicing in CNS tissues, and a novel tool to detect such motifs.},
}

@article{heber_mayr_stoye_2011,
  author       = {Heber, Steffen and Mayr, Richard and Stoye, Jens},
  title        = {Common Intervals of Multiple Permutations},
  journal      = {Algorithmica},
  volume       = {60},
  number       = {2},
  pages        = {175--206},
  year         = {2011},
  month        = jun,
  issn         = {1432-0541},
  doi          = {10.1007/s00453-009-9332-1},
}

@incollection{kim_zhao_howard_heber_2009,
  author       = {Kim, Jihye and Zhao, Sihui and Howard, Brian E. and Heber, Steffen},
  title        = {Mining of cis-Regulatory Motifs Associated with Tissue-Specific Alternative Splicing},
  booktitle    = {Bioinformatics Research and Applications},
  pages        = {260--271},
  year         = {2009},
  publisher    = {Springer Berlin Heidelberg},
  isbn         = {9783642015502 9783642015519},
  issn         = {0302-9743 1611-3349},
  doi          = {10.1007/978-3-642-01551-9_26},
  url          = {http://dx.doi.org/10.1007/978-3-642-01551-9_26},
  abstractNote = {Alternative splicing (AS) is an important post-transcriptional mechanism that can increase protein diversity and affect mRNA stability and translation efficiency. Many studies targeting the regulation of alternative splicing have focused on individual motifs; however, little is known about how such motifs work in concert. In this paper, we use distribution-based quantitative association rule mining to find combinatorial cis-regulatory motifs and to investigate the effect of motif pairs. We also show that motifs that occur in motif pairs typically occur in clusters.},
}

% First of two records that shared this key; it keeps the unsuffixed key.
@incollection{howard_sick_heber_2009,
  author       = {Howard, Brian E. and Sick, Beate and Heber, Steffen},
  title        = {Practical Quality Assessment of Microarray Data by Simulation of Differential Gene Expression},
  booktitle    = {Bioinformatics Research and Applications},
  pages        = {18--27},
  year         = {2009},
  publisher    = {Springer Berlin Heidelberg},
  isbn         = {9783642015502 9783642015519},
  issn         = {0302-9743 1611-3349},
  doi          = {10.1007/978-3-642-01551-9_3},
  url          = {http://dx.doi.org/10.1007/978-3-642-01551-9_3},
  abstractNote = {There are many methods for assessing the quality of microarray data, but little guidance regarding what to do when defective data is identified. Depending on the scientific question asked, discarding flawed data from a small experiment may be detrimental. Here we describe a novel quality assessment method that is designed to identify chips that should be discarded from an experiment. This technique simulates a set of differentially expressed genes and then assesses whether discarding each chip enhances or obscures the recovery of this known set. We compare our method to expert annotations derived using popular quality diagnostics and show, with examples, that the decision to discard a chip depends on the details of the particular experiment.},
}

@article{wheeler_heimberg_moy_sperling_holstein_heber_peterson_2009,
  author       = {Wheeler, Benjamin M. and Heimberg, Alysha M. and Moy, Vanessa N. and Sperling, Erik A. and Holstein, Thomas W. and Heber, Steffen and Peterson, Kevin J.},
  title        = {The deep evolution of metazoan {microRNAs}},
  journal      = {Evolution \& Development},
  volume       = {11},
  number       = {1},
  pages        = {50--68},
  year         = {2009},
  issn         = {1525-142X},
  doi          = {10.1111/j.1525-142X.2008.00302.x},
  abstractNote = {SUMMARY microRNAs (miRNAs) are approximately 22‐nucleotide noncoding RNA regulatory genes that are key players in cellular differentiation and homeostasis. They might also play important roles in shaping metazoan macroevolution. Previous studies have shown that miRNAs are continuously being added to metazoan genomes through time, and, once integrated into gene regulatory networks, show only rare mutations within the primary sequence of the mature gene product and are only rarely secondarily lost. However, because the conclusions from these studies were largely based on phylogenetic conservation of miRNAs between model systems like Drosophila and the taxon of interest, it was unclear if these trends would describe most miRNAs in most metazoan taxa. Here, we describe the shared complement of miRNAs among 18 animal species using a combination of 454 sequencing of small RNA libraries with genomic searches. We show that the evolutionary trends elucidated from the model systems are generally true for all miRNA families and metazoan taxa explored: the continuous addition of miRNA families with only rare substitutions to the mature sequence, and only rare instances of secondary loss. Despite this conservation, we document evolutionary stable shifts to the determination of position 1 of the mature sequence, a phenomenon we call seed shifting, as well as the ability to post‐transcriptionally edit the 5′ end of the mature read, changing the identity of the seed sequence and possibly the repertoire of downstream targets. Finally, we describe a novel type of miRNA in demosponges that, although shows a different pre‐miRNA structure, still shows remarkable conservation of the mature sequence in the two sponge species analyzed. We propose that miRNAs might be excellent phylogenetic markers, and suggest that the advent of morphological complexity might have its roots in miRNA innovation.},
}

% Carries journal, volume, number and pages, so it is typed as an article
% (it was previously typed as misc).
@article{shi_sun_li_heber_sederoff_chiang_2010,
  author       = {Shi, Rui and Sun, Ying-Hsuan and Li, Quanzi and Heber, Steffen and Sederoff, Ronald and Chiang, Vincent L.},
  title        = {Towards a Systems Approach for Lignin Biosynthesis in {Populus} trichocarpa: Transcript Abundance and Specificity of the Monolignol Biosynthetic Genes},
  journal      = {Plant and Cell Physiology},
  volume       = {51},
  number       = {1},
  pages        = {144--163},
  year         = {2010},
  month        = jan,
  issn         = {1471-9053},
  doi          = {10.1093/pcp/pcp175},
  abstractNote = {As a step toward a comprehensive description of lignin biosynthesis in Populus trichocarpa, we identified from the genome sequence 95 phenylpropanoid gene models in 10 protein families encoding enzymes for monolignol biosynthesis. Transcript abundance was determined for all 95 genes in xylem, leaf, shoot and phloem using quantitative real-time PCR (qRT-PCR). We identified 23 genes that most probably encode monolignol biosynthesis enzymes during wood formation. Transcripts for 18 of the 23 are abundant and specific to differentiating xylem. We found evidence suggesting functional redundancy at the transcript level for phenylalanine ammonia-lyase (PAL), cinnamate 4-hydroxylase (C4H), 4-coumarate:CoA ligase (4CL), p-hydroxycinnamoyl-CoA:quinate shikimate p-hydroxycinnamoyltransferase (HCT), caffeoyl-CoA O-methyltransferase (CCoAOMT) and coniferyl aldehyde 5-hydroxylase (CAld5H). We carried out an enumeration-based motif identification and discriminant analysis on the promoters of all 95 genes. Five core motifs correctly discriminate the 18 xylem-specific genes from the 77 non-xylem genes. These motifs are similar to promoter elements known to regulate phenylpropanoid gene expression. This work suggests that genes in monolignol biosynthesis are regulated by multiple motifs, often related in sequence.},
}

% Second record under the formerly shared key (a different paper);
% suffixed "b" to make the key unique.
@article{howard_sick_heber_2009b,
  author       = {Howard, Brian E. and Sick, Beate and Heber, Steffen},
  title        = {Unsupervised assessment of microarray data quality using a {Gaussian} mixture model},
  journal      = {{BMC} Bioinformatics},
  volume       = {10},
  year         = {2009},
  month        = jun,
  issn         = {1471-2105},
  doi          = {10.1186/1471-2105-10-191},
  abstractNote = {Quality assessment of microarray data is an important and often challenging aspect of gene expression analysis. This task frequently involves the examination of a variety of summary statistics and diagnostic plots. The interpretation of these diagnostics is often subjective, and generally requires careful expert scrutiny. We show how an unsupervised classification technique based on the Expectation-Maximization (EM) algorithm and the naïve Bayes model can be used to automate microarray quality assessment. The method is flexible and can be easily adapted to accommodate alternate quality statistics and platforms. We evaluate our approach using Affymetrix 3' gene expression and exon arrays and compare the performance of this method to a similar supervised approach. This research illustrates the efficacy of an unsupervised classification approach for the purpose of automated microarray data quality assessment. Since our approach requires only unannotated training data, it is easy to customize and to keep up-to-date as technology evolves. In contrast to other "black box" classification systems, this method also allows for intuitive explanations.},
}

@article{cholewa_mantey_heber_hollweg_2010,
  author       = {Cholewa, Jürgen and Mantey, Stefanie and Heber, Stefanie and Hollweg, Wibke},
  title        = {Developmental surface and phonological dysgraphia in {German} 3rd graders},
  journal      = {Reading and Writing},
  volume       = {23},
  number       = {1},
  pages        = {97--127},
  year         = {2010},
  month        = jan,
  publisher    = {Springer Science and Business Media LLC},
  issn         = {0922-4777 1573-0905},
  doi          = {10.1007/S11145-008-9153-7},
  url          = {http://dx.doi.org/10.1007/S11145-008-9153-7},
}

@article{zhi_keich_pevzner_heber_tang_2007, title={Correcting base-assignment errors in repeat regions of shotgun assembly}, volume={4}, ISSN={["1557-9964"]}, DOI={10.1109/TCBB.2007.1005}, abstractNote={Accurate base-assignment in repeat regions of a whole genome shotgun assembly is an unsolved problem. Since reads in repeat regions cannot be easily attributed to a unique location in the genome, current assemblers may place these reads arbitrarily. As a result, the base-assignment error rate in repeats is likely to be much higher than that in the rest of the genome. We developed an iterative algorithm, EULER-AIR, that is able to correct base-assignment errors in finished genome sequences in public databases. The Wolbachia genome is among the best finished genomes. Using this genome project as an example, we demonstrated that EULER-AIR can 1) discover and correct base-assignment errors, 2) provide accurate read assignments, 3) utilize finishing reads for accurate base-assignment, and 4) provide guidance for designing finishing experiments. In the genome of Wolbachia, EULER-AIR found 16 positions with ambiguous base-assignment and two positions with erroneous bases. Besides Wolbachia, many other genome sequencing projects have significantly fewer finishing reads and, hence, are likely to contain more base-assignment errors in repeats. 
We demonstrate that EULER-AIR is a software tool that can be used to find and correct base-assignment errors in a genome assembly project}, number={1}, journal={IEEE-ACM TRANSACTIONS ON COMPUTATIONAL BIOLOGY AND BIOINFORMATICS}, author={Zhi, Degui and Keich, Uri and Pevzner, Pavel and Heber, Steffen and Tang, Haixu}, year={2007}, pages={54–64} } @article{li_andersen_heber_zhang_2007, title={Non-monotonic dose-response relationship in steroid hormone receptor-mediated gene expression}, volume={38}, ISSN={["1479-6813"]}, DOI={10.1677/JME-07-0003}, abstractNote={Steroid hormone receptors are the targets of many environmental endocrine active chemicals (EACs) and synthetic drugs used in hormone therapy. While most of these chemical compounds have a unidirectional and monotonic effect, certain EACs can display non-monotonic dose–response behaviors and some synthetic drugs are selective endocrine modulators. Mechanisms underlying these complex endocrine behaviors have not been fully understood. By formulating an ordinary differential equation-based computational model, we investigated in this study the steady-state dose–response behavior of exogenous steroid ligands in an endogenous hormonal background under various parameter conditions. Our simulation revealed that non-monotonic dose–responses in gene expression can arise within the classical genomic framework of steroid signaling. Specifically, when the exogenous ligand is an agonist, a U-shaped dose–response appears as a result of the inherently nonlinear process of receptor homodimerization. This U-shaped dose–response curve can be further modulated by mixed-ligand heterodimers formed between endogenous ligand-bound and exogenous ligand-bound receptor monomers. When the heterodimer is transcriptionally inactive or repressive, the magnitude of U-shape increases; conversely, when the heterodimer is transcriptionally active, the magnitude of U-shape decreases. 
Additionally, we found that an inverted U-shaped dose–response can arise when the heterodimer is a strong transcription activator regardless of whether the exogenous ligand is an agonist or antagonist. Our work provides a novel mechanism for non-monotonic, particularly U-shaped, dose–response behaviors observed with certain steroid mimics, and may help not only understand how selective steroid receptor modulators work but also improve risk assessment for EACs.}, number={5-6}, journal={JOURNAL OF MOLECULAR ENDOCRINOLOGY}, author={Li, Li and Andersen, Melvin E. and Heber, Steffen and Zhang, Qiang}, year={2007}, pages={569–585} } @article{frahm_howard_heber_muddiman_2006, title={Accessible proteomics space and its implications for peak capacity for zero-, one- and two-dimensional separations coupled with FT-ICR and TOF mass spectrometry}, volume={41}, ISSN={["1096-9888"]}, DOI={10.1002/jms.1024}, abstractNote={AbstractThe number and wide dynamic range of components found in biological matrixes present several challenges for global proteomics. In this perspective, we will examine the potential of zero‐dimensional (0D), one‐dimensional (1D), and two‐dimensional (2D) separations coupled with Fourier‐transform ion cyclotron resonance (FT‐ICR) and time‐of‐flight (TOF) mass spectrometry (MS) for the analysis of complex mixtures. We describe and further develop previous reports on the space occupied by peptides, to calculate the theoretical peak capacity available to each separations‐mass spectrometry method examined. Briefly, the peak capacity attainable by each of the mass analyzers was determined from the mass resolving power (RP) and the m/z space occupied by peptides considered from the mass distribution of tryptic peptides from National Center for Biotechnology Information's (NCBI's) nonredundant database. 
Our results indicate that reverse‐phase‐nanoHPLC (RP‐nHPLC) separation coupled with FT‐ICR MS offers an order of magnitude improvement in peak capacity over RP‐nHPLC separation coupled with TOF MS. The addition of an orthogonal separation method, strong cation exchange (SCX), for 2D LC‐MS demonstrates an additional 10‐fold improvement in peak capacity over 1D LC‐MS methods. Peak capacity calculations for 0D LC, two different 1D RP‐HPLC methods, and 2D LC (with various numbers of SCX fractions) for both RP‐HPLC methods coupled to FT‐ICR and TOF MS are examined in detail. Peak capacity production rates, which take into account the total analysis time, are also considered for each of the methods. Furthermore, the significance of the space occupied by peptides is discussed. Copyright © 2006 John Wiley & Sons, Ltd.}, number={3}, journal={JOURNAL OF MASS SPECTROMETRY}, author={Frahm, JL and Howard, BE and Heber, S and Muddiman, DC}, year={2006}, month={Mar}, pages={281–288} } @article{heber_savage_2005, title={Common intervals of trees}, volume={93}, ISSN={["1872-6119"]}, DOI={10.1016/j.ipl.2004.09.016}, abstractNote={In this survey, we review practical algorithms for graph-theoretic problems that are expressible in monadic second-order logic. Monadic second-order (MSO) logic allows quantifications over unary relations (sets) and can be used to express a host of useful graph properties such as connectivity, c-colorability (for a fixed c), Hamiltonicity and minor inclusion. A celebrated theorem in this area by Courcelle states that any graph problem expressible in MSO can be solved in linear time on graphs that admit a tree-decomposition of constant width. Courcelle’s Theorem has been used thus far as a theoretic tool to establish that linear-time algorithms exist for graph problems by demonstrating that the problem in question is expressible by an MSO formula. 
A straightforward implementation of the algorithm in the proof of Courcelle’s Theorem is useless as it runs into space-explosion problems even for small values of treewidth. Of late, there have been several attempts to circumvent these problems and we review some of these in this survey. This survey also introduces the reader to the notions of tree-decompositions and the basics of monadic second order logic.}, number={2}, journal={INFORMATION PROCESSING LETTERS}, author={Heber, S and Savage, CD}, year={2005}, month={Jan}, pages={69–74} } @article{psarros_heber_sick_thoppae_harshman_sick_2005, title={RACE: Remote Analysis Computation for gene Expression data}, volume={33}, ISSN={["1362-4962"]}, DOI={10.1093/nar/gki490}, abstractNote={The Remote Analysis Computation for gene Expression data (RACE) suite is a collection of bioinformatics web tools designed for the analysis of DNA microarray data. RACE performs probe-level data preprocessing, extensive quality checks, data visualization and data normalization for Affymetrix GeneChips. In addition, it offers differential expression analysis on normalized expression levels from any array platform. RACE estimates the false discovery rates of lists of potentially regulated genes and provides a Gene Ontology-term analysis tool for GeneChip data to support the biological interpretation and annotation of results. The analysis is fully automated but can be customized by flexible parameter settings. To offer a convenient starting point for subsequent analyses, and to provide maximum transparency, the R scripts used to generate the results can be downloaded along with the output files. 
RACE is freely available for use at .}, journal={NUCLEIC ACIDS RESEARCH}, author={Psarros, M and Heber, S and Sick, M and Thoppae, G and Harshman, K and Sick, B}, year={2005}, month={Jul}, pages={W638–W643} } @article{leipzig_pevzner_heber_2004, title={The alternative splicing gallery (ASG): bridging the gap between genome and transcriptome}, volume={32}, DOI={10.1039/nar/gkh731}, number={13}, journal={Nucleic Acids Research}, author={Leipzig, J. and Pevzner, P. and Heber, Steffen}, year={2004}, pages={3977–3983} } @inbook{heber_stoye_2001, title={Finding All Common Intervals of k Permutations}, ISBN={9783540422716 9783540481942}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/3-540-48194-x_19}, DOI={10.1007/3-540-48194-x_19}, abstractNote={Given k permutations of n elements, a k-tuple of intervals of these permutations consisting of the same set of elements is called a common interval. We present an algorithm that finds in a family of k permutations of n elements all K common intervals in optimal O(nk+K) time and O(n) additional space.This extends a result by Uno and Yagiura (Algorithmica 26, 290-309, 2000) who present an algorithm to find all K common intervals of k = 2 permutations in optimal O(n+K) time and O(n) space. To achieve our result, we introduce the set of irreducible intervals, a generating subset of the set of all common intervals of k permutations.}, booktitle={Combinatorial Pattern Matching}, publisher={Springer Berlin Heidelberg}, author={Heber, Steffen and Stoye, Jens}, year={2001}, pages={207–218} }