@article{miao_calhoun_ge_li_2023, title={Performance Implication of Tensor Irregularity and Optimization for Distributed Tensor Decomposition}, volume={10}, ISSN={["2329-4957"]}, DOI={10.1145/3580315}, abstractNote={Tensors are used by a wide variety of applications to represent multi-dimensional data, and tensor decompositions are a class of methods for latent data analytics, data compression, and so on. Many of these applications generate large tensors with irregular dimension sizes and nonzero distributions. CANDECOMP/PARAFAC decomposition (CPD) is a popular low-rank tensor decomposition for discovering latent features. The increasing memory and execution-time overhead of CPD for large tensors makes distributed-memory implementations the only feasible solution. The sparsity and irregularity of tensors hinder the performance and scalability of distributed-memory implementations. While previous works have proved successful for CPD on tensors with relatively regular dimension sizes and nonzero distributions, they either deliver unsatisfactory performance and scalability for irregular tensors or require significant preprocessing time. In this work, we focus on medium-grained tensor distribution to address their limitations for irregular tensors. We first investigate the problem thoroughly through theoretical and experimental analysis. We show that the main cause of poor CPD performance and scalability is the imbalance among multiple types of computations and communications and their tradeoffs, and that sparsity and irregularity make these balances and tradeoffs challenging to achieve. Irregularity of a sparse tensor is categorized along two aspects: very different dimension sizes and a non-uniform nonzero distribution. Typically, optimizing one type of load imbalance makes the other types more severe for irregular tensors. To address these challenges, we propose an irregularity-aware distributed CPD that leverages sparsity and irregularity information to identify the best tradeoff between the different imbalances with low time overhead. We materialize the idea with two optimization methods: a prediction-based grid configuration and a matrix-oriented distribution policy, where the former establishes the global balance among computations and communications, and the latter further adjusts the balances among computations. The experimental results show that our proposed irregularity-aware distributed CPD is more scalable and outperforms medium- and fine-grained distributed implementations by up to 4.4× and 11.4×, respectively, on 1,536 processors. Our optimizations support different sparse tensor formats, such as compressed sparse fiber (CSF), coordinate (COO), and hierarchical coordinate (HiCOO), and achieve good scalability for all of them.}, number={2}, journal={ACM TRANSACTIONS ON PARALLEL COMPUTING}, author={Miao, Zheng and Calhoun, Jon C. and Ge, Rong and Li, Jiajia}, year={2023}, month={Jun} } @article{shivakumar_li_kannan_aluru_2023, title={Sparse Symmetric Format for Tucker Decomposition}, volume={34}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2023.3263124}, abstractNote={Tensor-based methods are receiving renewed attention in recent years due to their prevalence in diverse real-world applications. There is considerable literature on tensor representations and algorithms for tensor decompositions, both for dense and sparse tensors.
Many applications in hypergraph analytics, machine learning, psychometry, and signal processing result in tensors that are both sparse and symmetric, making them an important class for further study. Similar to the critical Tensor Times Matrix chain operation (TTMc) for general sparse tensors, the Sparse Symmetric Tensor Times Same Matrix chain (S$^{3}$TTMc) operation is compute and memory intensive due to the high tensor order and the associated factorial explosion in the number of non-zeros. We present the novel Compressed Sparse Symmetric (CSS) format for sparse symmetric tensors, along with an efficient parallel algorithm for the S$^{3}$TTMc operation. We theoretically establish that S$^{3}$TTMc on CSS achieves a better memory versus run-time trade-off compared to state-of-the-art implementations, and visualize the variation of the performance gap over the parameter space. We present experimental findings that confirm these results and achieve up to $2.72\times$ speedup on synthetic and real datasets. The scaling of the algorithm on different test architectures is also showcased to highlight the effect of machine characteristics on algorithm performance.}, number={6}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Shivakumar, Shruti and Li, Jiajia and Kannan, Ramakrishnan and Aluru, Srinivas}, year={2023}, month={Jun}, pages={1743–1756} } @article{du_li_wang_li_tan_sun_2022, title={AlphaSparse: Generating High Performance SpMV Codes Directly from Sparse Matrices}, ISSN={["2167-4329"]}, DOI={10.1109/SC41404.2022.00071}, abstractNote={Sparse Matrix-Vector multiplication (SpMV) is an essential computational kernel in many application scenarios. Tens of sparse matrix formats and implementations have been proposed to compress memory storage and speed up SpMV performance. We develop AlphaSparse, a superset of all existing works that goes beyond the scope of human-designed format(s) and implementation(s). AlphaSparse automatically creates novel machine-designed formats and SpMV kernel implementations entirely from the knowledge of input sparsity patterns and hardware architectures. Based on our proposed Operator Graph, which expresses the design path of an SpMV format and kernel, AlphaSparse consists of three main components: Designer, Format & Kernel Generator, and Search Engine. It takes an arbitrary sparse matrix as input and outputs a performant machine-designed format and SpMV implementation. By extensively evaluating 843 matrices from the SuiteSparse Matrix Collection, AlphaSparse achieves significant performance improvements of 3.2× on average over five state-of-the-art artificial formats and 1.5× on average (up to 2.7×) over an up-to-date implementation of the traditional auto-tuning philosophy.}, journal={SC22: INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS}, author={Du, Zhen and Li, Jiajia and Wang, Yinshan and Li, Xueqi and Tan, Guangming and Sun, Ninghui}, year={2022} } @article{miao_li_calhoun_ge_2022, title={BALA-CPD: BALanced and Asynchronous Distributed Tensor Decomposition}, ISSN={["1552-5244"]}, DOI={10.1109/CLUSTER51413.2022.00054}, abstractNote={Tensor decomposition is widely used in machine learning, recommendation systems, and social networks. Large real-world tensors require parallel algorithms running on distributed-memory systems.
Parallel algorithms suffer from two major performance bottlenecks: load imbalance and communication cost, which are difficult to overcome due to the inherent tradeoff among the multiple types of computations and communications, especially for irregular sparse tensors. Previous work predominantly focuses on balancing the load within the tensor-related computation, resulting in imbalance across the matrix-only computations and increased communication costs. It also relies heavily on collective communication operations and bulk-synchronous computation, interleaving stages of global communication with stages of local computation, and thus fails to hide the communication cost. In this paper, we present a novel algorithm, BALA-CPD, which achieves the best overall workload balance and effectively overlaps communication and computation for the popular distributed Canonical Polyadic Decomposition (CPD) algorithms. BALA-CPD uses a workload and data partition scheme that prioritizes load balance for all the matrix-only computations and all the communications. When necessary, BALA-CPD adjusts to mitigate the load imbalance for the tensor-related computation. Departing from the bulk-synchronous approaches, BALA-CPD breaks computation and communication into consecutive stages and masks the communication costs through a combination of one-sided asynchronous communication and a fine-grained interleaving of communication and computation. We implement BALA-CPD and evaluate it on a 64-node cluster with 1,280 processors. Experimental results show BALA-CPD is scalable and outperforms the state-of-the-art distributed implementations by up to 1.8× on 1,280 processors.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON CLUSTER COMPUTING (CLUSTER 2022)}, author={Miao, Zheng and Li, Jiajia and Calhoun, Jon C. and Ge, Rong}, year={2022}, pages={440–450} } @article{di napoli_bientinesi_li_uschmajew_2022, title={Editorial: High-performance tensor computations in scientific computing and data science}, volume={8}, ISSN={["2297-4687"]}, DOI={10.3389/fams.2022.1038885}, journal={FRONTIERS IN APPLIED MATHEMATICS AND STATISTICS}, author={Di Napoli, Edoardo and Bientinesi, Paolo and Li, Jiajia and Uschmajew, Andre}, year={2022}, month={Sep} }