@comment{
  Publication list, normalised from a citation-manager export.
  Conventions used below: one field per line; months use the standard
  three-letter macros; page ranges use a double hyphen; ISSNs are bare;
  words in titles that must keep their case are wrapped in braces.
  Field values are UTF-8.
  NOTE(review): huynh_cang_2024 and the 2022 preprint of the collective
  optimal transport paper have no journal field, and both Nature Methods
  entries carry volume 1, which looks like an export artefact -- confirm
  against the publisher records before relying on these values.
}

@article{huynh_cang_2024,
  title = {Topological and geometric analysis of cell states in single-cell transcriptomic data},
  url = {https://doi.org/10.1093/bib/bbae176},
  doi = {10.1093/bib/bbae176},
  abstractNote = {Single-cell RNA sequencing (scRNA-seq) enables dissecting cellular heterogeneity in tissues, resulting in numerous biological discoveries. Various computational methods have been devised to delineate cell types by clustering scRNA-seq data, where clusters are often annotated using prior knowledge of marker genes. In addition to identifying pure cell types, several methods have been developed to identify cells undergoing state transitions, which often rely on prior clustering results. The present computational approaches predominantly investigate the local and first-order structures of scRNA-seq data using graph representations, while scRNA-seq data frequently display complex high-dimensional structures. Here, we introduce scGeom, a tool that exploits the multiscale and multidimensional structures in scRNA-seq data by analyzing the geometry and topology through curvature and persistent homology of both cell and gene networks. We demonstrate the utility of these structural features to reflect biological properties and functions in several applications, where we show that curvatures and topological signatures of cell and gene networks can help indicate transition cells and the differentiation potential of cells. We also illustrate that structural characteristics can improve the classification of cell types.},
  author = {Huynh, Tram and Cang, Zixuan},
  year = {2024},
  month = apr,
}

@article{cang_nie_2023,
  title = {A mathematical method and software for spatially mapping intercellular communication},
  volume = {1},
  issn = {1548-7105},
  doi = {10.1038/s41592-022-01729-3},
  journal = {Nature Methods},
  author = {Cang, Zixuan and Nie, Qing},
  year = {2023},
  month = jan,
}

@article{dover_cang_ma_nie_vershynin_2023,
  title = {{AVIDA}: An alternating method for visualizing and integrating data},
  volume = {68},
  issn = {1877-7511},
  doi = {10.1016/j.jocs.2023.101998},
  abstractNote = {High-dimensional multimodal data arise in many scientific fields. The integration of multimodal data becomes challenging when there is no known correspondence between the samples and the features of different datasets. To tackle this challenge, we introduce AVIDA, a framework for simultaneously performing data alignment and dimension reduction. In the numerical experiments, Gromov–Wasserstein optimal transport and t-distributed stochastic neighbor embedding are used as the alignment and dimension reduction modules respectively. We show that by alternating dimension reduction and alignment, AVIDA aligns the representations of high-dimensional datasets without common features with four synthesized datasets and two real multimodal single-cell datasets. Compared to several existing methods, we demonstrate that AVIDA better preserves structures of individual datasets, especially distinct local structures in the joint low-dimensional representation, while achieving comparable alignment performance. Such a property is important in multimodal single-cell data analysis as some biological processes are uniquely captured by one of the datasets. In general applications, other methods can be used for the alignment and dimension reduction modules.},
  journal = {Journal of Computational Science},
  author = {Dover, Kathryn and Cang, Zixuan and Ma, Anna and Nie, Qing and Vershynin, Roman},
  year = {2023},
  month = apr,
  pages = {101998},
}

@article{cang_zhao_almet_stabell_ramos_plikus_atwood_nie_2023,
  title = {Screening cell-cell communication in spatial transcriptomics via collective optimal transport},
  volume = {1},
  issn = {1548-7105},
  url = {https://doi.org/10.1038/s41592-022-01728-4},
  doi = {10.1038/s41592-022-01728-4},
  abstractNote = {Spatial transcriptomic technologies and spatially annotated single-cell RNA sequencing datasets provide unprecedented opportunities to dissect cell-cell communication (CCC). However, incorporation of the spatial information and complex biochemical processes required in the reconstruction of CCC remains a major challenge. Here, we present COMMOT (COMMunication analysis by Optimal Transport) to infer CCC in spatial transcriptomics, which accounts for the competition between different ligand and receptor species as well as spatial distances between cells. A collective optimal transport method is developed to handle complex molecular interactions and spatial constraints. Furthermore, we introduce downstream analysis tools to infer spatial signaling directionality and genes regulated by signaling using machine learning models. We apply COMMOT to simulation data and eight spatial datasets acquired with five different technologies to show its effectiveness and robustness in identifying spatial CCC in data with varying spatial resolutions and gene coverages. Finally, COMMOT identifies new CCCs during skin morphogenesis in a case study of human epidermal development.},
  journal = {Nature Methods},
  author = {Cang, Zixuan and Zhao, Yanxiang and Almet, Axel A. and Stabell, Adam and Ramos, Raul and Plikus, Maksim V. and Atwood, Scott X. and Nie, Qing},
  year = {2023},
  month = jan,
}

@article{walker_cang_ren_bourgain-chang_nie_2022,
  title = {Deciphering tissue structure and function using spatial transcriptomics},
  url = {https://doi.org/10.1038/s42003-022-03175-5},
  doi = {10.1038/s42003-022-03175-5},
  abstractNote = {The rapid development of spatial transcriptomics (ST) techniques has allowed the measurement of transcriptional levels across many genes together with the spatial positions of cells. This has led to an explosion of interest in computational methods and techniques for harnessing both spatial and transcriptional information in analysis of ST datasets. The wide diversity of approaches in aim, methodology and technology for ST provides great challenges in dissecting cellular functions in spatial contexts. Here, we synthesize and review the key problems in analysis of ST data and methods that are currently applied, while also expanding on open questions and areas of future development.},
  journal = {Communications Biology},
  author = {Walker, Benjamin L. and Cang, Zixuan and Ren, Honglei and Bourgain-Chang, Eric and Nie, Qing},
  year = {2022},
  month = mar,
}

@article{ren_walker_cang_nie_2022,
  title = {Identifying multicellular spatiotemporal organization of cells with {SpaceFlow}},
  volume = {13},
  issn = {2041-1723},
  url = {https://doi.org/10.1038/s41467-022-31739-w},
  doi = {10.1038/s41467-022-31739-w},
  abstractNote = {One major challenge in analyzing spatial transcriptomic datasets is to simultaneously incorporate the cell transcriptome similarity and their spatial locations. Here, we introduce SpaceFlow, which generates spatially-consistent low-dimensional embeddings by incorporating both expression similarity and spatial information using spatially regularized deep graph networks. Based on the embedding, we introduce a pseudo-Spatiotemporal Map that integrates the pseudotime concept with spatial locations of the cells to unravel spatiotemporal patterns of cells. By comparing with multiple existing methods on several spatial transcriptomic datasets at both spot and single-cell resolutions, SpaceFlow is shown to produce a robust domain segmentation and identify biologically meaningful spatiotemporal patterns. Applications of SpaceFlow reveal evolving lineage in heart developmental data and tumor-immune interactions in human breast cancer data. Our study provides a flexible deep learning framework to incorporate spatiotemporal information in analyzing spatial transcriptomic data.},
  number = {1},
  journal = {Nature Communications},
  author = {Ren, Honglei and Walker, Benjamin L. and Cang, Zixuan and Nie, Qing},
  year = {2022},
  month = jul,
}

@article{cang_zhao_almet_stabell_ramos_plikus_atwood_nie_2022,
  title = {Screening cell-cell communication in spatial transcriptomics via collective optimal transport},
  url = {https://doi.org/10.1101/2022.08.24.505185},
  doi = {10.1101/2022.08.24.505185},
  abstractNote = {Spatial transcriptomic technologies and spatially annotated single cell RNA-sequencing (scRNA-seq) datasets provide unprecedented opportunities to dissect cell-cell communication (CCC). How to incorporate the spatial information and complex biochemical processes in reconstructing CCC remains a major challenge. Here we present COMMOT to infer CCC in spatial transcriptomics, which accounts for the competition among different ligand and receptor species as well as spatial distances between cells. A novel collective optimal transport method is developed to handle complex molecular interactions and spatial constraints. We introduce downstream analysis tools on spatial directionality of signalings and genes regulated by such signalings using machine learning models. We apply COMMOT to simulation data and eight spatial datasets acquired with five different technologies, showing its effectiveness and robustness in identifying spatial CCC in data with varying spatial resolutions and gene coverages. Finally, COMMOT reveals new CCCs during skin morphogenesis in a case study of human epidermal development. Both the method and the computational package have broad applications in inferring cell-cell interactions within spatial genomics datasets.},
  author = {Cang, Zixuan and Zhao, Yanxiang and Almet, Axel A. and Stabell, Adam and Ramos, Raul and Plikus, Maksim and Atwood, Scott X. and Nie, Qing},
  year = {2022},
  month = aug,
}

@article{cang_wang_wang_cho_holmes_nie_2021,
  title = {A multiscale model via single-cell transcriptomics reveals robust patterning mechanisms during early mammalian embryo development},
  volume = {17},
  url = {https://doi.org/10.1371/journal.pcbi.1008571},
  doi = {10.1371/journal.pcbi.1008571},
  abstractNote = {During early mammalian embryo development, a small number of cells make robust fate decisions at particular spatial locations in a tight time window to form inner cell mass (ICM), and later epiblast (Epi) and primitive endoderm (PE). While recent single-cell transcriptomics data allows scrutinization of heterogeneity of individual cells, consistent spatial and temporal mechanisms the early embryo utilize to robustly form the Epi/PE layers from ICM remain elusive. Here we build a multiscale three-dimensional model for mammalian embryo to recapitulate the observed patterning process from zygote to late blastocyst. By integrating the spatiotemporal information reconstructed from multiple single-cell transcriptomic datasets, the data-informed modeling analysis suggests two major processes critical to the formation of Epi/PE layers: a selective cell-cell adhesion mechanism (via EphA4/EphrinB2) for fate-location coordination and a temporal attenuation mechanism of cell signaling (via Fgf). Spatial imaging data and distinct subsets of single-cell gene expression data are then used to validate the predictions. Together, our study provides a multiscale framework that incorporates single-cell gene expression datasets to analyze gene regulations, cell-cell communications, and physical interactions among cells in complex geometries at single-cell resolution, with direct application to late-stage development of embryogenesis.},
  number = {3},
  journal = {PLOS Computational Biology},
  publisher = {Public Library of Science (PLoS)},
  author = {Cang, Zixuan and Wang, Yangyang and Wang, Qixuan and Cho, Ken W. Y. and Holmes, William and Nie, Qing},
  editor = {Umulis, David},
  year = {2021},
  month = mar,
  pages = {e1008571},
}

@article{zhu_du_nomura_gao_cang_wei_gordon_gurevitz_groome_dong_2021,
  title = {Charge substitutions at the voltage-sensing module of domain {III} enhance actions of site-3 and site-4 toxins on an insect sodium channel},
  volume = {137},
  url = {https://doi.org/10.1016/j.ibmb.2021.103625},
  doi = {10.1016/j.ibmb.2021.103625},
  abstractNote = {Scorpion α-toxins bind at the pharmacologically-defined site-3 on the sodium channel and inhibit channel inactivation by preventing the outward movement of the voltage sensor in domain IV (IVS4), whereas scorpion β-toxins bind at site-4 on the sodium channel and enhance channel activation by trapping the voltage sensor of domain II (IIS4) in its outward position. However, limited information is available on the role of the voltage-sensing modules (VSM, comprising S1-S4) of domains I and III in toxin actions. We have previously shown that charge reversing substitutions of the innermost positively-charged residues in IIIS4 (R4E, R5E) increase the activity of an insect-selective site-4 scorpion toxin, Lqh-dprIT3-c, on BgNav1-1a, a cockroach sodium channel. Here we show that substitutions R4E and R5E in IIIS4 also increase the activity of two site-3 toxins, LqhαIT from Leiurus quinquestriatus hebraeus and insect-selective Av3 from Anemonia viridis. Furthermore, charge reversal of either of two conserved negatively-charged residues, D1K and E2K, in IIIS2 also increase the action of the site-3 and site-4 toxins. Homology modeling suggests that S2-D1 and S2-E2 interact with S4-R4 and S4-R5 in the VSM of domain III (III-VSM), respectively, in the activated state of the channel. However, charge swapping between S2-D1 and S4-R4 had no compensatory effects on gating or toxin actions, suggesting that charged residue interactions are complex. Collectively, our results highlight the involvement of III-VSM in the actions of both site 3 and site 4 toxins, suggesting that charge reversing substitutions in III-VSM allosterically facilitate IIS4 or IVS4 voltage sensor trapping by these toxins.},
  journal = {Insect Biochemistry and Molecular Biology},
  publisher = {Elsevier BV},
  author = {Zhu, Qing and Du, Yuzhe and Nomura, Yoshiko and Gao, Rong and Cang, Zixuan and Wei, Guo-Wei and Gordon, Dalia and Gurevitz, Michael and Groome, James and Dong, Ke},
  year = {2021},
  month = oct,
  pages = {103625},
}

@article{maseda_cang_nie_2021,
  title = {{DEEPsc}: A Deep Learning-Based Map Connecting Single-Cell Transcriptomics and Spatial Imaging Data},
  volume = {12},
  url = {http://dx.doi.org/10.3389/fgene.2021.636743},
  doi = {10.3389/fgene.2021.636743},
  abstractNote = {Single-cell RNA sequencing (scRNA-seq) data provides unprecedented information on cell fate decisions; however, the spatial arrangement of cells is often lost. Several recent computational methods have been developed to impute spatial information onto a scRNA-seq dataset through analyzing known spatial expression patterns of a small subset of genes known as a reference atlas. However, there is a lack of comprehensive analysis of the accuracy, precision, and robustness of the mappings, along with the generalizability of these methods, which are often designed for specific systems. We present a system-adaptive deep learning-based method (DEEPsc) to impute spatial information onto a scRNA-seq dataset from a given spatial reference atlas. By introducing a comprehensive set of metrics that evaluate the spatial mapping methods, we compare DEEPsc with four existing methods on four biological systems. We find that while DEEPsc has comparable accuracy to other methods, an improved balance between precision and robustness is achieved. DEEPsc provides a data-adaptive tool to connect scRNA-seq datasets and spatial imaging datasets to analyze cell fate decisions. Our implementation with a uniform API can serve as a portal with access to all the methods investigated in this work for spatial exploration of cell fate decisions in scRNA-seq data. All methods evaluated in this work are implemented as an open-source software with a uniform interface.},
  journal = {Frontiers in Genetics},
  publisher = {Frontiers Media SA},
  author = {Maseda, Floyd and Cang, Zixuan and Nie, Qing},
  year = {2021},
  month = mar,
}

@article{tatarakis_cang_wu_sharma_karikomi_maclean_nie_schilling_2021,
  title = {Single-cell transcriptomic analysis of zebrafish cranial neural crest reveals spatiotemporal regulation of lineage decisions during development},
  url = {https://doi.org/10.1016/j.celrep.2021.110140},
  doi = {10.1016/j.celrep.2021.110140},
  abstractNote = {Neural crest (NC) cells migrate throughout vertebrate embryos to give rise to a huge variety of cell types, but when and where lineages emerge and their regulation remain unclear. We have performed single-cell RNA sequencing (RNA-seq) of cranial NC cells from the first pharyngeal arch in zebrafish over several stages during migration. Computational analysis combining pseudotime and real-time data reveals that these NC cells first adopt a transitional state, becoming specified mid-migration, with the first lineage decisions being skeletal and pigment, followed by neural and glial progenitors. In addition, by computationally integrating these data with RNA-seq data from a transgenic Wnt reporter line, we identify gene cohorts with similar temporal responses to Wnts during migration and show that one, Atp6ap2, is required for melanocyte differentiation. Together, our results show that cranial NC cell lineages arise progressively and uncover a series of spatially restricted cell interactions likely to regulate such cell-fate decisions.},
  journal = {Cell Reports},
  author = {Tatarakis, David and Cang, Zixuan and Wu, Xiaojun and Sharma, Praveer P. and Karikomi, Matthew and MacLean, Adam L. and Nie, Qing and Schilling, Thomas F.},
  year = {2021},
  month = dec,
}

@article{almet_cang_jin_nie_2021,
  title = {The landscape of cell–cell communication through single-cell transcriptomics},
  url = {https://doi.org/10.1016/j.coisb.2021.03.007},
  doi = {10.1016/j.coisb.2021.03.007},
  abstractNote = {Cell-cell communication is a fundamental process that shapes biological tissue. Historically, studies of cell-cell communication have been feasible for one or two cell types and a few genes. With the emergence of single-cell transcriptomics, we are now able to examine the genetic profiles of individual cells at unprecedented scale and depth. The availability of such data presents an exciting opportunity to construct a more comprehensive description of cell-cell communication. This review discusses the recent explosion of methods that have been developed to infer cell-cell communication from non-spatial and spatial single-cell transcriptomics, two promising technologies which have complementary strengths and limitations. We propose several avenues to propel this rapidly expanding field forward in meaningful ways.},
  journal = {Current Opinion in Systems Biology},
  author = {Almet, Axel A. and Cang, Zixuan and Jin, Suoqin and Nie, Qing},
  year = {2021},
  month = jun,
}

@article{nguyen_cang_wei_2020,
  title = {A review of mathematical representations of biomolecular data},
  url = {https://doi.org/10.1039/C9CP06554G},
  doi = {10.1039/C9CP06554G},
  abstractNote = {Recently, machine learning (ML) has established itself in various worldwide benchmarking competitions in computational biology, including Critical Assessment of Structure Prediction (CASP) and Drug Design Data Resource (D3R) Grand Challenges. However, the intricate structural complexity and high ML dimensionality of biomolecular datasets obstruct the efficient application of ML algorithms in the field. In addition to data and algorithm, an efficient ML machinery for biomolecular predictions must include structural representation as an indispensable component. Mathematical representations that simplify the biomolecular structural complexity and reduce ML dimensionality have emerged as a prime winner in D3R Grand Challenges. This review is devoted to the recent advances in developing low-dimensional and scalable mathematical representations of biomolecules in our laboratory. We discuss three classes of mathematical approaches, including algebraic topology, differential geometry, and graph theory. We elucidate how the physical and biological challenges have guided the evolution and development of these mathematical apparatuses for massive and diverse biomolecular data. We focus the performance analysis on protein-ligand binding predictions in this review although these methods have had tremendous success in many other applications, such as protein classification, virtual screening, and the predictions of solubility, solvation free energies, toxicity, partition coefficients, protein folding stability changes upon mutation, etc.},
  journal = {Physical Chemistry Chemical Physics},
  publisher = {Royal Society of Chemistry (RSC)},
  author = {Nguyen, Duc Duy and Cang, Zixuan and Wei, Guo-Wei},
  year = {2020},
}

@article{wang_cang_wei_2020,
  title = {A topology-based network tree for the prediction of protein–protein binding affinity changes following mutation},
  url = {https://doi.org/10.1038/s42256-020-0149-6},
  doi = {10.1038/s42256-020-0149-6},
  abstractNote = {The ability to predict protein–protein interactions is crucial to our understanding of a wide range of biological activities and functions in the human body, and for guiding drug discovery. Despite considerable efforts to develop suitable computational methods, predicting protein–protein interaction binding affinity changes following mutation (ΔΔG) remains a severe challenge. Algebraic topology, a champion in recent worldwide competitions for protein–ligand binding affinity predictions, is a promising approach to simplifying the complexity of biological structures. Here we introduce element- and site-specific persistent homology (a new branch of algebraic topology) to simplify the structural complexity of protein–protein complexes and embed crucial biological information into topological invariants. We also propose a new deep learning algorithm called NetTree to take advantage of convolutional neural networks and gradient-boosting trees. A topology-based network tree is constructed by integrating the topological representation and NetTree for predicting protein–protein interaction ΔΔG. Tests on major benchmark datasets indicate that the proposed topology-based network tree is an important improvement over the current state of the art in predicting ΔΔG. Persistent homology provides an efficient approach to simplifying the complexity of protein structure. Wang et al. combine this approach with convolutional neural networks and gradient-boosting trees to improve predictions of protein–protein interactions.},
  journal = {Nature Machine Intelligence},
  author = {Wang, Menglun and Cang, Zixuan and Wei, Guo-Wei},
  year = {2020},
  month = feb,
}

@article{haensel_jin_sun_cinco_dragan_nguyen_cang_gong_vu_maclean_et_al._2020,
  title = {Defining Epidermal Basal Cell States during Skin Homeostasis and Wound Healing Using Single-Cell Transcriptomics},
  volume = {30},
  url = {http://dx.doi.org/10.1016/j.celrep.2020.02.091},
  doi = {10.1016/j.celrep.2020.02.091},
  abstractNote = {Our knowledge of transcriptional heterogeneities in epithelial stem and progenitor cell compartments is limited. Epidermal basal cells sustain cutaneous tissue maintenance and drive wound healing. Previous studies have probed basal cell heterogeneity in stem and progenitor potential, but a comprehensive dissection of basal cell dynamics during differentiation is lacking. Using single-cell RNA sequencing coupled with RNAScope and fluorescence lifetime imaging, we identify three non-proliferative and one proliferative basal cell state in homeostatic skin that differ in metabolic preference and become spatially partitioned during wound re-epithelialization. Pseudotemporal trajectory and RNA velocity analyses predict a quasi-linear differentiation hierarchy where basal cells progress from Col17a1Hi/Trp63Hi state to early-response state, proliferate at the juncture of these two states, or become growth arrested before differentiating into spinous cells. Wound healing induces plasticity manifested by dynamic basal-spinous interconversions at multiple basal transcriptional states. Our study provides a systematic view of epidermal cellular dynamics, supporting a revised “hierarchical-lineage” model of homeostasis.},
  number = {11},
  journal = {Cell Reports},
  publisher = {Elsevier BV},
  author = {Haensel, Daniel and Jin, Suoqin and Sun, Peng and Cinco, Rachel and Dragan, Morgan and Nguyen, Quy and Cang, Zixuan and Gong, Yanwen and Vu, Remy and MacLean, Adam L. and others},
  year = {2020},
  month = mar,
  pages = {3932--3947.e6},
}

@article{cang_munch_wei_2020,
  title = {Evolutionary homology on coupled dynamical systems with applications to protein flexibility analysis},
  volume = {4},
  url = {http://dx.doi.org/10.1007/s41468-020-00057-9},
  doi = {10.1007/s41468-020-00057-9},
  abstractNote = {While the spatial topological persistence is naturally constructed from a radius-based filtration, it has hardly been derived from a temporal filtration. Most topological models are designed for the global topology of a given object as a whole. There is no method reported in the literature for the topology of an individual component in an object to the best of our knowledge. For many problems in science and engineering, the topology of an individual component is important for describing its properties. We propose evolutionary homology (EH) constructed via a time evolution-based filtration and topological persistence. Our approach couples a set of dynamical systems or chaotic oscillators by the interactions of a physical system, such as a macromolecule. The interactions are approximated by weighted graph Laplacians. Simplices, simplicial complexes, algebraic groups and topological persistence are defined on the coupled trajectories of the chaotic oscillators. The resulting EH gives rise to time-dependent topological invariants or evolutionary barcodes for an individual component of the physical system, revealing its topology-function relationship. In conjunction with Wasserstein metrics, the proposed EH is applied to protein flexibility analysis, an important problem in computational biophysics. Numerical results for the B-factor prediction of a benchmark set of 364 proteins indicate that the proposed EH outperforms all the other state-of-the-art methods in the field.},
  number = {4},
  journal = {Journal of Applied and Computational Topology},
  publisher = {Springer Science and Business Media LLC},
  author = {Cang, Zixuan and Munch, Elizabeth and Wei, Guo-Wei},
  year = {2020},
  month = dec,
  pages = {481--507},
}

@article{cang_nie_2020,
  title = {Inferring spatial and signaling relationships between cells from single cell transcriptomic data},
  url = {https://doi.org/10.1038/s41467-020-15968-5},
  doi = {10.1038/s41467-020-15968-5},
  abstractNote = {Single-cell RNA sequencing (scRNA-seq) provides details for individual cells; however, crucial spatial information is often lost. We present SpaOTsc, a method relying on structured optimal transport to recover spatial properties of scRNA-seq data by utilizing spatial measurements of a relatively small number of genes. A spatial metric for individual cells in scRNA-seq data is first established based on a map connecting it with the spatial measurements. The cell–cell communications are then obtained by “optimally transporting” signal senders to target signal receivers in space. Using partial information decomposition, we next compute the intercellular gene–gene information flow to estimate the spatial regulations between genes across cells. Four datasets are employed for cross-validation of spatial gene expression prediction and comparison to known cell–cell communications. SpaOTsc has broader applications, both in integrating non-spatial single-cell measurements with spatial data, and directly in spatial single-cell transcriptomics data to reconstruct spatial cellular dynamics in tissues.},
  journal = {Nature Communications},
  author = {Cang, Zixuan and Nie, Qing},
  year = {2020},
  month = apr,
}

@article{cang_wei_2020,
  title = {Persistent Cohomology for Data With Multicomponent Heterogeneous Information},
  volume = {2},
  url = {https://doi.org/10.1137/19M1272226},
  doi = {10.1137/19M1272226},
  abstractNote = {Persistent homology is a powerful tool for characterizing the topology of a data set at various geometric scales. When applied to the description of molecular structures, persistent homology can capture the multiscale geometric features and reveal certain interaction patterns in terms of topological invariants. However, in addition to the geometric information, there is a wide variety of nongeometric information of molecular structures, such as element types, atomic partial charges, atomic pairwise interactions, and electrostatic potential functions, that is not described by persistent homology. Although element-specific homology and electrostatic persistent homology can encode some nongeometric information into geometry based topological invariants, it is desirable to have a mathematical paradigm to systematically embed both geometric and nongeometric information, i.e., multicomponent heterogeneous information, into unified topological representations. To this end, we propose a persistent cohomology based framework for the enriched representation of data. In our framework, nongeometric information can either be distributed globally or reside locally on the datasets in the geometric sense and can be properly defined on topological spaces, i.e., simplicial complexes. Using the proposed persistent cohomology based framework, enriched barcodes are extracted from datasets to represent heterogeneous information. We consider a variety of datasets to validate the present formulation and illustrate the usefulness of the proposed method based on persistent cohomology. It is found that the proposed framework outperforms or at least matches the state-of-the-art methods in the protein-ligand binding affinity prediction from massive biomolecular datasets without resorting to any deep learning formulation.},
  number = {2},
  journal = {SIAM Journal on Mathematics of Data Science},
  publisher = {Society for Industrial \& Applied Mathematics (SIAM)},
  author = {Cang, Zixuan and Wei, Guo-Wei},
  year = {2020},
  month = jan,
  pages = {396--418},
}

@article{guo_cang_yao_kim_deans_wei_kang_hong_2020,
  title = {Structural cavities are critical to balancing stability and activity of a membrane-integral enzyme},
  volume = {117},
  url = {https://doi.org/10.1073/pnas.1917770117},
  doi = {10.1073/pnas.1917770117},
  abstractNote = {Significance The physical principles of membrane protein folding are not well understood. Because of the lack of water inside the cell membrane, the hydrophobic effect cannot drive the folding of membrane-embedded structural elements. Therefore, van der Waals packing interaction becomes a crucial driving force, which may imply that the membrane protein interior is tightly packed. Paradoxically, membrane proteins such as channels, transporters, receptors, and enzymes require cavities (i.e., voids, pockets, and pores) for function. Then, how do membrane proteins achieve the stability carrying out function? Using experiment and molecular dynamics simulation, we show that cavities in membrane proteins can be stabilized by favorable interaction with surrounding lipid molecules and play a pivotal role in balancing stability and flexibility for function. Packing interaction is a critical driving force in the folding of helical membrane proteins. Despite the importance, packing defects (i.e., cavities including voids, pockets, and pores) are prevalent in membrane-integral enzymes, channels, transporters, and receptors, playing essential roles in function. Then, a question arises regarding how the two competing requirements, packing for stability vs. cavities for function, are reconciled in membrane protein structures. Here, using the intramembrane protease GlpG of Escherichia coli as a model and cavity-filling mutation as a probe, we tested the impacts of native cavities on the thermodynamic stability and function of a membrane protein. We find several stabilizing mutations which induce substantial activity reduction without distorting the active site. Notably, these mutations are all mapped onto the regions of conformational flexibility and functional importance, indicating that the cavities facilitate functional movement of GlpG while compromising the stability. Experiment and molecular dynamics simulation suggest that the stabilization is induced by the coupling between enhanced protein packing and weakly unfavorable lipid desolvation, or solely by favorable lipid solvation on the cavities. Our result suggests that, stabilized by the relatively weak interactions with lipids, cavities are accommodated in membrane proteins without severe energetic cost, which, in turn, serve as a platform to fine-tune the balance between stability and flexibility for optimal activity.},
  number = {36},
  journal = {Proceedings of the National Academy of Sciences},
  publisher = {Proceedings of the National Academy of Sciences},
  author = {Guo, Ruiqiong and Cang, Zixuan and Yao, Jiaqi and Kim, Miyeon and Deans, Erin and Wei, Guowei and Kang, Seung-gu and Hong, Heedeok},
  year = {2020},
  month = sep,
  pages = {22146--22156},
}

@article{nguyen_cang_wu_wang_cao_wei_2019,
  title = {Mathematical deep learning for pose and binding affinity prediction and ranking in {D3R} {Grand} {Challenges}},
  volume = {33},
  url = {http://dx.doi.org/10.1007/s10822-018-0146-6},
  doi = {10.1007/s10822-018-0146-6},
  abstractNote = {Advanced mathematics, such as multiscale weighted colored subgraph and element specific persistent homology, and machine learning including deep neural networks were integrated to construct mathematical deep learning models for pose and binding affinity prediction and ranking in the last two D3R Grand Challenges in computer-aided drug design and discovery. D3R Grand Challenge 2 focused on the pose prediction, binding affinity ranking and free energy prediction for Farnesoid X receptor ligands. Our models obtained the top place in absolute free energy prediction for free energy set 1 in stage 2. The latest competition, D3R Grand Challenge 3 (GC3), is considered as the most difficult challenge so far. It has five subchallenges involving Cathepsin S and five other kinase targets, namely VEGFR2, JAK2, p38-α, TIE2, and ABL1. There is a total of 26 official competitive tasks for GC3. Our predictions were ranked 1st in 10 out of these 26 tasks.},
  number = {1},
  journal = {Journal of Computer-Aided Molecular Design},
  publisher = {Springer Science and Business Media LLC},
  author = {Nguyen, Duc Duy and Cang, Zixuan and Wu, Kedi and Wang, Menglun and Cao, Yin and Wei, Guo-Wei},
  year = {2019},
  month = jan,
  pages = {71--82},
}

@article{cang_wei_2018, title={Integration of element specific persistent homology and machine learning for protein‐ligand binding affinity prediction}, url={https://doi.org/10.1002/cnm.2914}, DOI={10.1002/cnm.2914}, abstractNote={Protein‐ligand binding is a fundamental biological process that is paramount to many other biological processes, such as signal transduction, metabolic pathways, enzyme construction, cell secretion, and gene expression. Accurate prediction of protein‐ligand binding affinities is vital to rational drug design and the understanding of protein‐ligand binding and binding induced function. Existing binding affinity prediction methods are inundated with geometric detail and involve excessively high dimensions, which undermines their predictive power for massive binding data. Topology provides the ultimate level of abstraction and thus incurs too much reduction in geometric information. Persistent homology embeds geometric information into topological invariants and bridges the gap between complex geometry and abstract topology. However, it oversimplifies biological information. This work introduces element specific persistent homology (ESPH) or multicomponent persistent homology to retain crucial biological information during topological simplification. The combination of ESPH and machine learning gives rise to a powerful paradigm for macromolecular analysis. Tests on 2 large data sets indicate that the proposed topology‐based machine‐learning paradigm outperforms other existing methods in protein‐ligand binding affinity predictions. 
ESPH reveals protein‐ligand binding mechanism that can not be attained from other conventional techniques. The present approach reveals that protein‐ligand hydrophobic interactions are extended to 40Å away from the binding site, which has a significant ramification to drug and protein design.}, journal={International Journal for Numerical Methods in Biomedical Engineering}, author={Cang, Zixuan and Wei, Guo‐Wei}, year={2018}, month={Feb} } @article{zhao_cang_tong_wei_2018, title={Protein pocket detection via convex hull surface evolution and associated Reeb graph}, volume={34}, url={http://dx.doi.org/10.1093/bioinformatics/bty598}, DOI={10.1093/bioinformatics/bty598}, abstractNote={Motivation Protein pocket information is invaluable for drug target identification, agonist design, virtual screening and receptor‐ligand binding analysis. A recent study indicates that about half holoproteins can simultaneously bind multiple interacting ligands in a large pocket containing structured sub‐pockets. Although this hierarchical pocket and sub‐pocket structure has a significant impact to multi‐ligand synergistic interactions in the protein binding site, there is no method available for this analysis. This work introduces a computational tool based on differential geometry, algebraic topology and physics‐based simulation to address this pressing issue. Results We propose to detect protein pockets by evolving the convex hull surface inwards until it touches the protein surface everywhere. The governing partial differential equations (PDEs) include the mean curvature flow combined with the eikonal equation commonly used in the fast marching algorithm in the Eulerian representation. The surface evolution induced Morse function and Reeb graph are utilized to characterize the hierarchical pocket and sub‐pocket structure in controllable detail. The proposed method is validated on PDBbind refined sets of 4414 protein‐ligand complexes. 
Extensive numerical tests indicate that the proposed method not only provides a unique description of pocket‐sub‐pocket relations, but also offers efficient estimations of pocket surface area, pocket volume and pocket depth. Availability and implementation Source code available at https://github.com/rdzhao/ProteinPocketDetection. Webserver available at http://weilab.math.msu.edu/PPD/.}, number={17}, journal={Bioinformatics}, publisher={Oxford University Press (OUP)}, author={Zhao, Rundong and Cang, Zixuan and Tong, Yiying and Wei, Guo-Wei}, year={2018}, month={Sep}, pages={i830–i837} } @article{cang_mu_wei_2018, title={Representability of algebraic topology for biomolecules in machine learning based scoring and virtual screening}, volume={14}, url={https://doi.org/10.1371/journal.pcbi.1005929}, DOI={10.1371/journal.pcbi.1005929}, abstractNote={This work introduces a number of algebraic topology approaches, including multi-component persistent homology, multi-level persistent homology, and electrostatic persistence for the representation, characterization, and description of small molecules and biomolecular complexes. In contrast to the conventional persistent homology, multi-component persistent homology retains critical chemical and biological information during the topological simplification of biomolecular geometric complexity. Multi-level persistent homology enables a tailored topological description of inter- and/or intra-molecular interactions of interest. Electrostatic persistence incorporates partial charge information into topological invariants. These topological methods are paired with Wasserstein distance to characterize similarities between molecules and are further integrated with a variety of machine learning algorithms, including k-nearest neighbors, ensemble of trees, and deep convolutional neural networks, to manifest their descriptive and predictive powers for protein-ligand binding analysis and virtual screening of small molecules. 
Extensive numerical experiments involving 4,414 protein-ligand complexes from the PDBBind database and 128,374 ligand-target and decoy-target pairs in the DUD database are performed to test respectively the scoring power and the discriminatory power of the proposed topological learning strategies. It is demonstrated that the present topological learning outperforms other existing methods in protein-ligand binding affinity prediction and ligand-decoy discrimination.}, number={1}, journal={PLOS Computational Biology}, publisher={Public Library of Science (PLoS)}, author={Cang, Zixuan and Mu, Lin and Wei, Guo-Wei}, editor={Peng, JianEditor}, year={2018}, month={Jan}, pages={e1005929} } @article{cang_wei_2017, title={Analysis and prediction of protein folding energy changes upon mutation by element specific persistent homology}, volume={7}, url={http://dx.doi.org/10.1093/bioinformatics/btx460}, DOI={10.1093/bioinformatics/btx460}, abstractNote={Motivation Site directed mutagenesis is widely used to understand the structure and function of biomolecules. Computational prediction of mutation impacts on protein stability offers a fast, economical and potentially accurate alternative to laboratory mutagenesis. Most existing methods rely on geometric descriptions, this work introduces a topology based approach to provide an entirely new representation of mutation induced protein stability changes that could not be obtained from conventional techniques. Results Topology based mutation predictor (T‐MP) is introduced to dramatically reduce the geometric complexity and number of degrees of freedom of proteins, while element specific persistent homology is proposed to retain essential biological information. The present approach is found to outperform other existing methods in the predictions of globular protein stability changes upon mutation. A Pearson correlation coefficient of 0.82 with an RMSE of 0.92 kcal/mol is obtained on a test set of 350 mutation samples. 
For the prediction of membrane protein stability changes upon mutation, the proposed topological approach has a 84% higher Pearson correlation coefficient than the current state‐of‐the‐art empirical methods, achieving a Pearson correlation of 0.57 and an RMSE of 1.09 kcal/mol in a 5‐fold cross validation on a set of 223 membrane protein mutation samples. Availability and implementation http://weilab.math.msu.edu/TML/TML‐MP/ Contact wei@math.msu.edu Supplementary information Supplementary data are available at Bioinformatics online.}, journal={Bioinformatics}, publisher={Oxford University Press (OUP)}, author={Cang, Zixuan and Wei, Guowei}, year={2017}, month={Jul} } @article{cang_wei_2017, title={TopologyNet: Topology based deep convolutional and multi-task neural networks for biomolecular property predictions}, volume={13}, DOI={10.1371/journal.pcbi.1005690}, abstractNote={Although deep learning approaches have had tremendous success in image, video and audio processing, computer vision, and speech recognition, their applications to three-dimensional (3D) biomolecular structural data sets have been hindered by the geometric and biological complexity. To address this problem we introduce the element-specific persistent homology (ESPH) method. ESPH represents 3D complex geometry by one-dimensional (1D) topological invariants and retains important biological information via a multichannel image-like representation. This representation reveals hidden structure-function relationships in biomolecules. We further integrate ESPH and deep convolutional neural networks to construct a multichannel topological neural network (TopologyNet) for the predictions of protein-ligand binding affinities and protein stability changes upon mutation. To overcome the deep learning limitations from small and noisy training sets, we propose a multi-task multichannel topological convolutional neural network (MM-TCNN). 
We demonstrate that TopologyNet outperforms the latest methods in the prediction of protein-ligand binding affinities, mutation induced globular protein folding free energy changes, and mutation induced membrane protein folding free energy changes. Availability: weilab.math.msu.edu/TDL/}, number={7}, journal={PLOS Computational Biology}, publisher={Public Library of Science (PLoS)}, author={Cang, Zixuan and Wei, Guo-Wei}, editor={Dunbrack, Roland L.Editor}, year={2017}, month={Jul}, pages={e1005690} } @article{cang_mu_wu_opron_xia_wei_2015, title={A topological approach for protein classification}, volume={3}, url={http://dx.doi.org/10.1515/mlbmb-2015-0009}, DOI={10.1515/mlbmb-2015-0009}, abstractNote={Abstract Protein function and dynamics are closely related to its sequence and structure.However, prediction of protein function and dynamics from its sequence and structure is still a fundamental challenge in molecular biology. Protein classification, which is typically done through measuring the similarity between proteins based on protein sequence or physical information, serves as a crucial step toward the understanding of protein function and dynamics. Persistent homology is a new branch of algebraic topology that has found its success in the topological data analysis in a variety of disciplines, including molecular biology. The present work explores the potential of using persistent homology as an independent tool for protein classification. To this end, we propose a molecular topological fingerprint based support vector machine (MTF-SVM) classifier. Specifically,we construct machine learning feature vectors solely fromprotein topological fingerprints,which are topological invariants generated during the filtration process. To validate the presentMTF-SVMapproach, we consider four types of problems. First, we study protein-drug binding by using the M2 channel protein of influenza A virus. 
We achieve 96% accuracy in discriminating drug bound and unbound M2 channels. Secondly, we examine the use of MTF-SVM for the classification of hemoglobin molecules in their relaxed and taut forms and obtain about 80% accuracy. Thirdly, the identification of all alpha, all beta, and alpha-beta protein domains is carried out using 900 proteins.We have found a 85% success in this identification. Finally, we apply the present technique to 55 classification tasks of protein superfamilies over 1357 samples and 246 tasks over 11944 samples. Average accuracies of 82% and 73% are attained. The present study establishes computational topology as an independent and effective alternative for protein classification.}, number={1}, journal={Computational and Mathematical Biophysics}, publisher={Walter de Gruyter GmbH}, author={Cang, Zixuan and Mu, Lin and Wu, Kedi and Opron, Kristopher and Xia, Kelin and Wei, Guo-Wei}, year={2015}, month={Nov} }