@comment{Cleaned bibliography. Notable repairs: the three @book entries that all
  reused the key boman_chen_darve_rajamanickam_tuminaro_2017 (a hard "repeated
  entry" error in BibTeX) are disambiguated with b/c suffixes (the first keeps
  the original key so existing \cite commands still resolve); the two entries
  that duplicated chen_tuminaro_rajamanickam_boman_darve_2018 are merged into a
  single @incollection. Sandia reports are typed @techreport, months use the
  standard macros, page ranges use "--", DOIs replace redundant dx.doi.org URLs,
  and PDF-extraction debris in the abstracts has been repaired.}

@article{heavner_chen_gopal_martinsson_2023,
  title     = {Efficient algorithms for computing rank-revealing factorizations on a {GPU}},
  author    = {Heavner, Nathan and Chen, Chao and Gopal, Abinand and Martinsson, Per-Gunnar},
  journal   = {Numerical Linear Algebra with Applications},
  volume    = {30},
  number    = {6},
  pages     = {e2515},
  year      = {2023},
  month     = jun,
  publisher = {Wiley},
  issn      = {1070-5325, 1099-1506},
  doi       = {10.1002/nla.2515},
}

@article{chen_biros_2022,
  title     = {Overlapping Domain Decomposition Preconditioner for Integral Equations},
  author    = {Chen, Chao and Biros, George},
  journal   = {SIAM Journal on Scientific Computing},
  volume    = {44},
  number    = {6},
  pages     = {A3617--A3644},
  year      = {2022},
  month     = nov,
  publisher = {Society for Industrial \& Applied Mathematics ({SIAM})},
  issn      = {1064-8275, 1095-7197},
  doi       = {10.1137/21m1442917},
  abstract  = {The discretization of certain integral equations, e.g., the first-kind Fredholm equation of Laplace's equation, leads to symmetric positive-definite linear systems, where the coefficient matrix is dense and often ill-conditioned. We introduce a new preconditioner based on a novel overlapping domain decomposition that can be combined efficiently with existing fast direct solvers. Empirically, we observe that the condition number of the preconditioned system is $O(1)$, independent of the problem size. Our domain decomposition is designed so that we can construct approximate factorizations of subproblems efficiently. In particular, we apply the recursive skeletonization algorithm to subproblems associated with every subdomain. We present numerical results on problem sizes up to $16384^2$ in 2D and $256^3$ in 3D, which were solved in less than 16 hours and three hours, respectively, on an Intel Xeon Platinum 8280M.},
}

@inproceedings{chen_martinsson_2022,
  title     = {Solving Linear Systems on a {GPU} with Hierarchically Off-Diagonal Low-Rank Approximations},
  author    = {Chen, Chao and Martinsson, Per-Gunnar},
  booktitle = {SC22: International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages     = {1--15},
  year      = {2022},
  month     = nov,
  publisher = {IEEE},
  doi       = {10.1109/sc41404.2022.00089},
  abstract  = {We are interested in solving linear systems arising from three applications: (1) kernel methods in machine learning, (2) discretization of boundary integral equations from mathematical physics, and (3) Schur complements formed in the factorization of many large sparse matrices. The coefficient matrices are often data-sparse in the sense that their off-diagonal blocks have low numerical ranks; specifically, we focus on ``hierarchically off-diagonal low-rank ({HODLR})'' matrices. We introduce algorithms for factorizing HODLR matrices and for applying the factorizations on a GPU. The algorithms leverage the efficiency of batched dense linear algebra, and they scale nearly linearly with the matrix size when the numerical ranks are fixed. The accuracy of the HODLR-matrix approximation is a tunable parameter, so we can construct high-accuracy fast direct solvers or low-accuracy robust preconditioners. Numerical results show that we can solve problems with several millions of unknowns in a couple of seconds on a single GPU.},
}

@article{chen_reiz_yu_bungartz_biros_2021,
  title     = {Fast Approximation of the {Gauss--Newton} Hessian Matrix for the Multilayer Perceptron},
  author    = {Chen, Chao and Reiz, Severin and Yu, Chenhan D. and Bungartz, Hans-Joachim and Biros, George},
  journal   = {SIAM Journal on Matrix Analysis and Applications},
  volume    = {42},
  number    = {1},
  pages     = {165--184},
  year      = {2021},
  month     = jan,
  publisher = {Society for Industrial \& Applied Mathematics ({SIAM})},
  issn      = {0895-4798, 1095-7162},
  doi       = {10.1137/19m129961x},
  abstract  = {We introduce a fast algorithm for entry-wise evaluation of the Gauss-Newton Hessian (GNH) matrix for the fully-connected feed-forward neural network. The algorithm has a precomputation step and a sampling step. While it generally requires $O(Nn)$ work to compute an entry (and the entire column) in the GNH matrix for a neural network with $N$ parameters and $n$ data points, our fast sampling algorithm reduces the cost to $O(n+d/\epsilon^2)$ work, where $d$ is the output dimension of the network and $\epsilon$ is a prescribed accuracy (independent of $N$). One application of our algorithm is constructing the hierarchical-matrix (H-matrix) approximation of the GNH matrix for solving linear systems and eigenvalue problems. It generally requires $O(N^2)$ memory and $O(N^3)$ work to store and factorize the GNH matrix, respectively. The H-matrix approximation requires only $O(N r_o)$ memory footprint and $O(N r_o^2)$ work to be factorized, where $r_o \ll N$ is the maximum rank of off-diagonal blocks in the GNH matrix. We demonstrate the performance of our fast algorithm and the H-matrix approximation on classification and autoencoder neural networks.},
}

@article{wang_chen_lee_darve_2021,
  title     = {{PBBFMM3D}: A parallel black-box algorithm for kernel matrix-vector multiplication},
  author    = {Wang, Ruoxi and Chen, Chao and Lee, Jonghyun and Darve, Eric},
  journal   = {Journal of Parallel and Distributed Computing},
  volume    = {154},
  pages     = {64--73},
  year      = {2021},
  month     = aug,
  publisher = {Elsevier BV},
  issn      = {0743-7315},
  doi       = {10.1016/j.jpdc.2021.04.005},
  abstract  = {Kernel matrix-vector product is ubiquitous in many science and engineering applications. However, a naive method requires $O(N^2)$ operations, which becomes prohibitive for large-scale problems. We introduce a parallel method that provably requires $O(N)$ operations to reduce the computation cost. The distinct feature of our method is that it requires only the ability to evaluate the kernel function, offering a black-box interface to users. Our parallel approach targets multi-core shared-memory machines and is implemented using OpenMP. Numerical results demonstrate up to $19\times$ speedup on 32 cores. We also present a real-world application in geostatistics, where our parallel method was used to deliver fast principal component analysis of covariance matrices.},
}

@article{chen_liang_biros_2021,
  title     = {{RCHOL}: Randomized {Cholesky} Factorization for Solving {SDD} Linear Systems},
  author    = {Chen, Chao and Liang, Tianyu and Biros, George},
  journal   = {SIAM Journal on Scientific Computing},
  volume    = {43},
  number    = {6},
  pages     = {C411--C438},
  year      = {2021},
  month     = jan,
  publisher = {Society for Industrial \& Applied Mathematics ({SIAM})},
  issn      = {1064-8275, 1095-7197},
  doi       = {10.1137/20m1380624},
  abstract  = {We introduce a randomized algorithm, namely {\tt rchol}, to construct an approximate Cholesky factorization for a given sparse Laplacian matrix (a.k.a., graph Laplacian). The (exact) Cholesky factorization for the matrix introduces a clique in the associated graph after eliminating every row/column. By randomization, {\tt rchol} samples a subset of the edges in the clique. We prove {\tt rchol} is breakdown free and apply it to solving linear systems with symmetric diagonally-dominant matrices. In addition, we parallelize {\tt rchol} based on the nested dissection ordering for shared-memory machines. Numerical experiments demonstrated the robustness and the scalability of {\tt rchol}. For example, our parallel code scaled up to 64 threads on a single node for solving the 3D Poisson equation, discretized with the 7-point stencil on a $1024\times 1024 \times 1024$ grid, or one billion unknowns.},
}

@inproceedings{boman_cambier_chen_darve_rajamanickam_tuminaro_2020,
  title     = {A preconditioner based on sparsified nested dissection and low-rank approximation},
  author    = {Boman, Erik Gunnar and Cambier, L{\'e}opold and Chen, Chao and Darve, Eric and Rajamanickam, Sivasankaran and Tuminaro, Raymond S.},
  booktitle = {XXI Householder Symposium on Numerical Linear Algebra},
  pages     = {128},
  year      = {2020},
}

@article{cambier_chen_boman_rajamanickam_tuminaro_darve_2020,
  title     = {An Algebraic Sparsified Nested Dissection Algorithm Using Low-Rank Approximations},
  author    = {Cambier, L{\'e}opold and Chen, Chao and Boman, Erik Gunnar and Rajamanickam, Sivasankaran and Tuminaro, Raymond S. and Darve, Eric},
  journal   = {SIAM Journal on Matrix Analysis and Applications},
  volume    = {41},
  number    = {2},
  pages     = {715--746},
  year      = {2020},
  month     = jan,
  publisher = {Society for Industrial \& Applied Mathematics ({SIAM})},
  issn      = {0895-4798, 1095-7162},
  doi       = {10.1137/19m123806x},
  abstract  = {We propose a new algorithm for the fast solution of large, sparse, symmetric positive-definite linear systems, spaND -- sparsified Nested Dissection. It is based on nested dissection, sparsification and low-rank compression. After eliminating all interiors at a given level of the elimination tree, the algorithm sparsifies all separators corresponding to the interiors. This operation reduces the size of the separators by eliminating some degrees of freedom but without introducing any fill-in. This is done at the expense of a small and controllable approximation error. The result is an approximate factorization that can be used as an efficient preconditioner. We then perform several numerical experiments to evaluate this algorithm. We demonstrate that a version using orthogonal factorization and block-diagonal scaling takes less CG iterations to converge than previous similar algorithms on various kinds of problems. Furthermore, this algorithm is provably guaranteed to never break down and the matrix stays symmetric positive-definite throughout the process. We evaluate the algorithm on some large problems and show it exhibits near-linear scaling. The factorization time is roughly $O(N)$ and the number of iterations grows slowly with $N$.},
}

@article{takahashi_chen_darve_2020,
  title     = {Parallelization of the inverse fast multipole method with an application to boundary element method},
  author    = {Takahashi, Toru and Chen, Chao and Darve, Eric},
  journal   = {Computer Physics Communications},
  volume    = {247},
  pages     = {106975},
  year      = {2020},
  month     = feb,
  publisher = {Elsevier BV},
  issn      = {0010-4655},
  doi       = {10.1016/j.cpc.2019.106975},
  abstract  = {We present an algorithm to parallelize the inverse fast multipole method (IFMM), which is an approximate direct solver for dense linear systems. The parallel scheme is based on a greedy coloring algorithm, where two nodes in the hierarchy with the same color are separated by at least $\sigma$ nodes. We proved that when $\sigma \geq 6$, the workload associated with one color is embarrassingly parallel. However, the number of nodes in a group (color) may be small when $\sigma = 6$. Therefore, we also explored $\sigma = 3$, where a small fraction of the algorithm needs to be serialized, and the overall parallel efficiency was improved. We implemented the parallel IFMM using OpenMP for shared-memory machines. Successively, we applied it to a fast-multipole accelerated boundary element method (FMBEM) as a preconditioner, and compared its efficiency with (a) the original IFMM parallelized by linking a multi-threaded linear algebra library and (b) the commonly used parallel block-diagonal preconditioner. Our results showed that our parallel IFMM achieved at most $4\times$ and $11\times$ speedups over the reference method (a) and (b), respectively, in realistic examples involving more than one million variables.},
}

@techreport{lee_chen_toru_darve_yoon_2020,
  title       = {Scalable spatio-temporal modeling using a fast multipole method for {3D} tracer concentration breakthrough data with magnetic resonance imaging},
  author      = {Lee, Jonghyun and Chen, Chao and Takahashi, Toru and Darve, Eric and Yoon, Hongkyu},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2020},
}

@article{chen_cambier_boman_rajamanickam_tuminaro_darve_2019,
  title     = {A robust hierarchical solver for ill-conditioned systems with applications to ice sheet modeling},
  author    = {Chen, Chao and Cambier, L{\'e}opold and Boman, Erik Gunnar and Rajamanickam, Sivasankaran and Tuminaro, Raymond S. and Darve, Eric},
  journal   = {Journal of Computational Physics},
  volume    = {396},
  pages     = {819--836},
  year      = {2019},
  month     = nov,
  publisher = {Elsevier BV},
  issn      = {0021-9991},
  doi       = {10.1016/j.jcp.2019.07.024},
  abstract  = {A hierarchical solver is proposed for solving sparse ill-conditioned linear systems in parallel. The solver is based on a modification of the LoRaSp method, but employs a deferred-compression technique, which provably reduces the approximation error and significantly improves efficiency. Moreover, the deferred-compression technique introduces minimal overhead and does not affect parallelism. As a result, the new solver achieves linear computational complexity under mild assumptions and excellent parallel scalability. To demonstrate the performance of the new solver, we focus on applying it to solve sparse linear systems arising from ice sheet modeling. The strong anisotropic phenomena associated with the thin structure of ice sheets creates serious challenges for existing solvers. To address the anisotropy, we additionally developed a customized partitioning scheme for the solver, which captures the strong-coupling direction accurately. In general, the partitioning can be computed algebraically with existing software packages, and thus the new solver is generalizable for solving other sparse linear systems. Our results show that ice sheet problems of about 300 million degrees of freedom have been solved in just a few minutes using a thousand processors.},
}

@inproceedings{chen_reiz_yu_bungartz_biros_2019,
  title     = {{H}-matrix approximation of the {Gauss-Newton} Hessian matrix for the multilayer perceptron},
  author    = {Chen, Chao and Reiz, Severin and Yu, Chenhan D. and Bungartz, Hans-Joachim and Biros, George},
  booktitle = {33rd Conference on Neural Information Processing Systems ({NeurIPS} 2019)},
  year      = {2019},
}

@techreport{cambier_chen_boman_rajamanickam_tuminaro_darve_2019,
  title       = {{SpaND}: An Algebraic Sparsified Nested Dissection Algorithm Using Low-Rank Approximations},
  author      = {Cambier, L{\'e}opold and Chen, Chao and Boman, Erik Gunnar and Rajamanickam, Sivasankaran and Tuminaro, Raymond S. and Darve, Eric},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2019},
}

@techreport{boman_chen_darve_rajamanickam_tuminaro_2018,
  title       = {A Hierarchical Low-Rank Solver for Sparse Linear Systems and Its Variations},
  author      = {Boman, Erik Gunnar and Chen, Chao and Darve, Eric and Rajamanickam, Sivasankaran and Tuminaro, Raymond S.},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2018},
}

@article{chen_pouransari_rajamanickam_boman_darve_2018,
  title     = {A distributed-memory hierarchical solver for general sparse linear systems},
  author    = {Chen, Chao and Pouransari, Hadi and Rajamanickam, Sivasankaran and Boman, Erik Gunnar and Darve, Eric},
  journal   = {Parallel Computing},
  volume    = {74},
  pages     = {49--64},
  year      = {2018},
  month     = may,
  publisher = {Elsevier BV},
  issn      = {0167-8191},
  doi       = {10.1016/j.parco.2017.12.004},
  abstract  = {We present a parallel hierarchical solver for general sparse linear systems on distributed-memory machines. For large-scale problems, this fully algebraic algorithm is faster and more memory-efficient than sparse direct solvers because it exploits the low-rank structure of fill-in blocks. Depending on the accuracy of low-rank approximations, the hierarchical solver can be used either as a direct solver or as a preconditioner. The parallel algorithm is based on data decomposition and requires only local communication for updating boundary data on every processor. Moreover, the computation-to-communication ratio of the parallel algorithm is approximately the volume-to-surface-area ratio of the subdomain owned by every processor. We present various numerical results to demonstrate the versatility and scalability of the parallel algorithm.},
}

@incollection{chen_tuminaro_rajamanickam_boman_darve_2018,
  title     = {A hierarchical solver for extruded meshes with applications to ice sheet modeling},
  author    = {Chen, Chao and Tuminaro, Raymond S. and Rajamanickam, Sivasankaran and Boman, Erik Gunnar and Darve, Eric},
  booktitle = {Center for Computing Research Summer Proceedings 2017},
  editor    = {Baczewski, A. D. and Parks, M. L.},
  publisher = {Sandia National Laboratories},
  pages     = {3--18},
  year      = {2018},
  note      = {Report SAND2018-2780O},
}

@article{chen_aubry_oppelstrup_arsenlis_darve_2018,
  title     = {Fast algorithms for evaluating the stress field of dislocation lines in anisotropic elastic media},
  author    = {Chen, Chao and Aubry, S. and Oppelstrup, T. and Arsenlis, A. and Darve, Eric},
  journal   = {Modelling and Simulation in Materials Science and Engineering},
  volume    = {26},
  number    = {4},
  pages     = {045007},
  year      = {2018},
  month     = apr,
  publisher = {IOP Publishing},
  issn      = {0965-0393, 1361-651X},
  doi       = {10.1088/1361-651x/aab7bb},
  abstract  = {In dislocation dynamics (DD) simulations, the most computationally intensive step is the evaluation of the elastic interaction forces among dislocation ensembles. Because the pair-wise interaction between dislocations is long-range, this force calculation step can be significantly accelerated by the fast multipole method (FMM). We implemented and compared four different methods in isotropic and anisotropic elastic media: one based on the Taylor series expansion (Taylor FMM), one based on the spherical harmonics expansion (Spherical FMM), one kernel-independent method based on the Chebyshev interpolation (Chebyshev FMM), and a new kernel-independent method that we call the Lagrange FMM. The Taylor FMM is an existing method, used in ParaDiS, one of the most popular DD simulation software packages. The Spherical FMM employs a more compact multipole representation than the Taylor FMM does and is thus more efficient. However, both the Taylor FMM and the Spherical FMM are difficult to derive in anisotropic elastic media because the interaction force is complex and has no closed analytical formula. The Chebyshev FMM requires only being able to evaluate the interaction between dislocations and thus can be applied easily in anisotropic elastic media. But it has a relatively large memory footprint, which limits its usage. The Lagrange FMM was designed to be a memory-efficient black-box method. Various numerical experiments are presented to demonstrate the convergence and the scalability of the four methods.},
}

@phdthesis{chen_2018,
  title  = {Parallel Hierarchical Linear Solvers and Fast Multipole Methods with Applications},
  author = {Chen, Chao},
  school = {Stanford University},
  year   = {2018},
}

@techreport{boman_chen_rajamanickam_2018,
  title       = {Scheduling Parallel Tasks using Graph Coloring},
  author      = {Boman, Erik Gunnar and Chen, Chao and Rajamanickam, Sivasankaran},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2018},
}

@techreport{boman_chen_darve_rajamanickam_tuminaro_2017,
  title       = {A Hierarchical Low-Rank Solver for Large Sparse Linear Systems},
  author      = {Boman, Erik Gunnar and Chen, Chao and Darve, Eric and Rajamanickam, Sivasankaran and Tuminaro, Raymond S.},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2017},
}

@techreport{boman_chen_darve_rajamanickam_tuminaro_2017b,
  title       = {A Parallel Hierarchical Low-Rank Solver for General Sparse Matrices},
  author      = {Boman, Erik Gunnar and Chen, Chao and Darve, Eric and Rajamanickam, Sivasankaran and Tuminaro, Raymond S.},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2017},
}

@techreport{boman_chen_darve_rajamanickam_tuminaro_2017c,
  title       = {Hierarchical Matrices and Low-Rank Methods for Extreme-Scale Solvers},
  author      = {Boman, Erik Gunnar and Chen, Chao and Darve, Eric and Rajamanickam, Sivasankaran and Tuminaro, Raymond S.},
  institution = {Sandia National Laboratories, Albuquerque, NM},
  year        = {2017},
}

@article{chen_rajamanickam_boman_darve_2016,
  title   = {Parallel hierarchical solver for elliptic partial differential equations},
  author  = {Chen, Chao and Rajamanickam, Sivasankaran and Boman, Erik Gunnar and Darve, Eric},
  journal = {CCR},
  pages   = {3},
  year    = {2016},
}

@inproceedings{coulier_chen_pouransari_darve_2016,
  title     = {The Inverse Fast Multipole Method as an Efficient Preconditioner for Dense Linear Systems},
  author    = {Coulier, P. and Chen, Chao and Pouransari, Hadi and Darve, Eric},
  booktitle = {Conference on Parallel Processing for Scientific Computing},
  year      = {2016},
  note      = {Paris, France, April 12--15, 2016},
}