@article{zhang_tang_2024, title={A Theoretical Analysis of DeepWalk and Node2vec for Exact Recovery of Community Structures in Stochastic Blockmodels}, volume={46}, ISSN={["1939-3539"]}, DOI={10.1109/TPAMI.2023.3327631}, abstractNote={Random-walk-based network embedding algorithms like DeepWalk and node2vec are widely used to obtain euclidean representation of the nodes in a network prior to performing downstream inference tasks. However, despite their impressive empirical performance, there is a lack of theoretical results explaining their large-sample behavior. In this paper, we study node2vec and DeepWalk through the perspective of matrix factorization. In particular, we analyze these algorithms in the setting of community detection for stochastic blockmodel graphs (and their degree-corrected variants). By exploiting the row-wise uniform perturbation bound for leading singular vectors, we derive high-probability error bounds between the matrix factorization-based node2vec/DeepWalk embeddings and their true counterparts, uniformly over all node embeddings. Based on strong concentration results, we further show the perfect membership recovery by node2vec/DeepWalk, followed by $K$K-means/medians algorithms. Specifically, as the network becomes sparser, our results guarantee that with large enough window size and vertex number, applying $K$K-means/medians on the matrix factorization-based node2vec embeddings can, with high probability, correctly recover the memberships of all vertices in a network generated from the stochastic blockmodel (or its degree-corrected variants). The theoretical justifications are mirrored in the numerical experiments and real data applications, for both the original node2vec and its matrix factorization variant.}, number={2}, journal={IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE}, author={Zhang, Yichi and Tang, Minh}, year={2024}, month={Feb}, pages={1065–1078} } @article{du_tang_2023, title={Hypothesis testing for equality of latent positions in random graphs}, volume={29}, ISSN={["1573-9759"]}, DOI={10.3150/22-BEJ1581}, abstractNote={We consider the hypothesis testing problem that two vertices $i$ and $j$ of a generalized random dot product graph have the same latent positions, possibly up to scaling. Special cases of this hypothesis test include testing whether two vertices in a stochastic block model or degree-corrected stochastic block model graph have the same block membership vectors, or testing whether two vertices in a popularity adjusted block model have the same community assignment. We propose several test statistics based on the empirical Mahalanobis distances between the $i$th and $j$th rows of either the adjacency or the normalized Laplacian spectral embedding of the graph. We show that, under mild conditions, these test statistics have limiting chi-square distributions under both the null and local alternative hypothesis, and we derived explicit expressions for the non-centrality parameters under the local alternative. Using these limit results, we address the model selection problems including choosing between the standard stochastic block model and its degree-corrected variant, and choosing between the ER model and stochastic block model. The effectiveness of our proposed tests are illustrated via both simulation studies and real data applications.}, number={4}, journal={BERNOULLI}, author={Du, Xinjie and Tang, Minh}, year={2023}, month={Nov}, pages={3221–3254} } @article{rubin-delanchy_cape_tang_priebe_2022, title={A statistical interpretation of spectral embedding: The generalised random dot product graph}, ISSN={["1467-9868"]}, DOI={10.1111/rssb.12509}, abstractNote={Abstract}, journal={JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIES B-STATISTICAL METHODOLOGY}, author={Rubin-Delanchy, Patrick and Cape, Joshua and Tang, Minh and Priebe, Carey E.}, year={2022}, month={Jun} } @article{tang_cape_priebe_2022, title={Asymptotically efficient estimators for stochastic blockmodels: The naive MLE, the rank-constrained MLE, and the spectral estimator}, volume={28}, ISSN={["1573-9759"]}, DOI={10.3150/21-BEJ1376}, abstractNote={We establish asymptotic normality results for estimation of the block probability matrix $\mathbf{B}$ in stochastic blockmodel graphs using spectral embedding when the average degrees grows at the rate of $\omega(\sqrt{n})$ in $n$, the number of vertices. As a corollary, we show that when $\mathbf{B}$ is of full-rank, estimates of $\mathbf{B}$ obtained from spectral embedding are asymptotically efficient. When $\mathbf{B}$ is singular the estimates obtained from spectral embedding can have smaller mean square error than those obtained from maximizing the log-likelihood under no rank assumption, and furthermore, can be almost as efficient as the true MLE that assume known $\mathrm{rk}(\mathbf{B})$. Our results indicate, in the context of stochastic blockmodel graphs, that spectral embedding is not just computationally tractable, but that the resulting estimates are also admissible, even when compared to the purportedly optimal but computationally intractable maximum likelihood estimation under no rank assumption.}, number={2}, journal={BERNOULLI}, author={Tang, Minh and Cape, Joshua and Priebe, Carey E.}, year={2022}, month={May}, pages={1049–1073} } @article{athreya_lubberts_priebe_park_tang_lyzinski_kane_lewis_2022, title={Numerical Tolerance for Spectral Decompositions of Random Matrices and Applications to Network Inference}, ISSN={["1537-2715"]}, DOI={10.1080/10618600.2022.2082972}, abstractNote={Abstract We precisely quantify the impact of statistical error in the quality of a numerical approximation to a random matrix eigendecomposition, and under mild conditions, we use this to introduce an optimal numerical tolerance for residual error in spectral decompositions of random matrices. We demonstrate that terminating an eigendecomposition algorithm when the numerical error and statistical error are of the same order results in computational savings with no loss of accuracy. We illustrate the practical consequences of our stopping criterion with an analysis of simulated and real networks. Our theoretical results and real-data examples establish that the tradeoff between statistical and numerical error is of significant importance for subsequent inference.}, journal={JOURNAL OF COMPUTATIONAL AND GRAPHICAL STATISTICS}, author={Athreya, Avanti and Lubberts, Zachary and Priebe, Carey E. and Park, Youngser and Tang, Minh and Lyzinski, Vince and Kane, Michael and Lewis, Bryan W.}, year={2022}, month={Jul} } @article{koo_tang_trosset_2022, title={Popularity Adjusted Block Models are Generalized Random Dot Product Graphs}, ISSN={["1537-2715"]}, DOI={10.1080/10618600.2022.2081576}, abstractNote={Abstract We connect two random graph models, the Popularity Adjusted Block Model (PABM) and the Generalized Random Dot Product Graph (GRDPG), by demonstrating that the PABM is a special case of the GRDPG in which communities correspond to mutually orthogonal subspaces of latent vectors. This insight allows us to construct new algorithms for community detection and parameter estimation for the PABM, as well as improve an existing algorithm that relies on Sparse Subspace Clustering. Using established asymptotic properties of Adjacency Spectral Embedding for the GRDPG, we derive asymptotic properties of these algorithms. In particular, we demonstrate that the absolute number of community detection errors tends to zero as the number of graph vertices tends to infinity. Simulation experiments illustrate these properties. Supplementary materials for this article are available online.}, journal={JOURNAL OF COMPUTATIONAL AND GRAPHICAL STATISTICS}, author={Koo, John and Tang, Minh and Trosset, Michael W.}, year={2022}, month={Jun} } @article{chung_varjavand_arroyo-relion_alyakin_agterberg_tang_priebe_vogelstein_2022, title={Valid two-sample graph testing via optimal transport Procrustes and multiscale graph correlation with applications in connectomics}, volume={11}, ISSN={["2049-1573"]}, DOI={10.1002/sta4.429}, abstractNote={Testing whether two graphs come from the same distribution is of interest in many real‐world scenarios, including brain network analysis. Under the random dot product graph model, the nonparametric hypothesis testing framework consists of embedding the graphs using the adjacency spectral embedding (ASE), followed by aligning the embeddings using the median flip heuristic and finally applying the nonparametric maximum mean discrepancy (MMD) test to obtain a p value. Using synthetic data generated from Drosophila brain networks, we show that the median flip heuristic results in an invalid test and demonstrate that optimal transport Procrustes (OTP) for alignment resolves the invalidity. We further demonstrate that substituting the MMD test with the multiscale graph correlation (MGC) test leads to a more powerful test both in synthetic and in simulated data. Lastly, we apply this powerful test to the right and left hemispheres of the larval Drosophila mushroom body brain networks and conclude that there is not sufficient evidence to reject the null hypothesis that the two hemispheres are equally distributed.}, number={1}, journal={STAT}, author={Chung, Jaewon and Varjavand, Bijan and Arroyo-Relion, Jesus and Alyakin, Anton and Agterberg, Joshua and Tang, Minh and Priebe, Carey E. and Vogelstein, Joshua T.}, year={2022}, month={Dec} } @article{zheng_lyzinski_priebe_tang_2022, title={Vertex Nomination Between Graphs via Spectral Embedding and Quadratic Programming}, ISSN={["1537-2715"]}, DOI={10.1080/10618600.2022.2060238}, abstractNote={Abstract Given a network and a subset of interesting vertices whose identities are only partially known, the vertex nomination problem seeks to rank the remaining vertices in such a way that the interesting vertices are ranked at the top of the list. An important variant of this problem is vertex nomination in the multiple graphs setting. Given two graphs G 1, G 2 with common vertices and a vertex of interest , we wish to rank the vertices of G 2 such that the vertices most similar to x are ranked at the top of the list. The current article addresses this problem and proposes a method that first applies adjacency spectral graph embedding to embed the graphs into a common Euclidean space, and then solves a penalized linear assignment problem to obtain the nomination lists. Since the spectral embedding of the graphs are only unique up to orthogonal transformations, we present two approaches to eliminate this potential nonidentifiability. One approach is based on orthogonal Procrustes and is applicable when there are enough vertices with known correspondence between the two graphs. Another approach uses adaptive point set registration and is applicable when there are few or no vertices with known correspondence. We show that our nomination scheme leads to accurate nomination under a generative model for pairs of random graphs that are approximately low-rank and possibly with pairwise edge correlations. We illustrate our algorithm’s performance through simulation studies on synthetic data as well as analysis of a high-school friendship network and analysis of transition rates between web pages on the Bing search engine. Supplementary materials for this article are available and include R code, data, and an appendix with detailed proofs.}, journal={JOURNAL OF COMPUTATIONAL AND GRAPHICAL STATISTICS}, author={Zheng, Runbing and Lyzinski, Vince and Priebe, Carey E. and Tang, Minh}, year={2022}, month={May} } @article{athreya_cape_tang_2021, title={Eigenvalues of Stochastic Blockmodel Graphs and Random Graphs with Low-Rank Edge Probability Matrices}, ISSN={["0976-8378"]}, DOI={10.1007/s13171-021-00268-x}, abstractNote={We derive the limiting distribution for the outlier eigenvalues of the adjacency matrix for random graphs with independent edges whose edge probability matrices have low-rank structure. We show that when the number of vertices tends to infinity, the leading eigenvalues in magnitude are jointly multivariate Gaussian with bounded covariances. As a special case, this implies a limiting normal distribution for the outlier eigenvalues of stochastic blockmodel graphs and their degree-corrected or mixed-membership variants. Our result extends the classical result of Füredi and Komlós on the fluctuation of the largest eigenvalue for Erdős–Rényi graphs.}, journal={SANKHYA-SERIES A-MATHEMATICAL STATISTICS AND PROBABILITY}, author={Athreya, Avanti and Cape, Joshua and Tang, Minh}, year={2021}, month={Nov} } @article{athreya_tang_park_priebe_2021, title={On Estimation and Inference in Latent Structure Random Graphs}, volume={36}, ISSN={["2168-8745"]}, DOI={10.1214/20-STS787}, abstractNote={We define a latent structure model (LSM) random graph as a random dot product graph (RDPG) in which the latent position distribution incorporates both probabilistic and geometric constraints, delineated by a family of underlying distributions on some fixed Euclidean space, and a structural support submanifold from which the latent positions for the graph are drawn. For a one-dimensional latent structure model with known structural support, we show how spectral estimates of the latent positions of an RDPG can be used for efficient estimation of the paramaters of the LSM. We describe how to estimate or learn the structural support in cases where it is unknown, with an illustrative focus on graphs with latent positions along the Hardy-Weinberg curve. Finally, we use the latent structure model formulation to test bilateral homology in the Drosophila connectome.}, number={1}, journal={STATISTICAL SCIENCE}, author={Athreya, Avanti and Tang, Minh and Park, Youngser and Priebe, Carey E.}, year={2021}, month={Feb}, pages={68–88} } @article{li_tang_charon_priebe_2020, title={Central limit theorems for classical multidimensional scaling}, volume={14}, ISSN={["1935-7524"]}, DOI={10.1214/20-EJS1720}, abstractNote={Classical multidimensional scaling (CMDS) is a widely used method in manifold learning. It takes in a dissimilarity matrix and outputs a coordinate matrix based on a spectral decomposition. However, there are not yet any statistical results characterizing the performance ofCMDS under randomness, such as perturbation analysis when the objects are sampled from a probabilistic model. In this paper, we present such an analysis given that the objects are sampled from a suitable distribution. In particular, we show that the resulting embedding gives rise to a central limit theorem for noisy dissimilarity measurements, and provide compelling simulation and real data illustration of this CLT for CMDS.}, number={1}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Li, Gongkai and Tang, Minh and Charon, Nichlas and Priebe, Carey}, year={2020}, pages={2362–2394} } @article{cape_tang_priebe_2019, title={On spectral embedding performance and elucidating network structure in stochastic blockmodel graphs}, volume={7}, ISSN={["2050-1250"]}, DOI={10.1017/nws.2019.23}, abstractNote={Abstract}, number={3}, journal={NETWORK SCIENCE}, author={Cape, Joshua and Tang, Minh and Priebe, Carey E.}, year={2019}, month={Sep}, pages={269–291} }