@article{shin_darbon_karniadakis_2023, title={Accelerating gradient descent and Adam via fractional gradients}, volume={161}, url={http://dx.doi.org/10.1016/j.neunet.2023.01.002}, DOI={10.1016/j.neunet.2023.01.002}, abstractNote={We propose a class of novel fractional-order optimization algorithms. We define a fractional-order gradient via the Caputo fractional derivatives that generalizes the integer-order gradient. We refer to it as the Caputo fractional-based gradient, and develop an efficient implementation to compute it. A general class of fractional-order optimization methods is then obtained by replacing integer-order gradients with the Caputo fractional-based gradients. To give concrete algorithms, we consider gradient descent (GD) and Adam, and extend them to the Caputo fractional GD (CfGD) and the Caputo fractional Adam (CfAdam). We demonstrate the superiority of CfGD and CfAdam on several large-scale optimization problems that arise from scientific machine learning applications, such as an ill-conditioned least squares problem on real-world data and the training of neural networks involving non-convex objective functions. Numerical examples show that both CfGD and CfAdam result in acceleration over GD and Adam, respectively. We also derive error bounds of CfGD for quadratic functions, which further indicate that CfGD could mitigate the dependence on the condition number in the rate of convergence and result in significant acceleration over GD.}, journal={Neural Networks}, publisher={Elsevier BV}, author={Shin, Yeonjong and Darbon, Jérôme and Karniadakis, George Em}, year={2023}, month={Apr}, pages={185–201} } @article{shin_zhang_karniadakis_2023, title={Error estimates of residual minimization using neural networks for linear PDEs}, volume={4}, ISSN={2689-3967}, url={http://dx.doi.org/10.1615/jmachlearnmodelcomput.2023050411}, DOI={10.1615/jmachlearnmodelcomput.2023050411}, abstractNote={We propose an abstract framework for analyzing the convergence of least-squares methods based on residual minimization when feasible solutions are neural networks. With the norm relations and compactness arguments, we derive error estimates for both continuous and discrete formulations of residual minimization in strong and weak forms. The formulations cover recently developed physics-informed neural networks based on strong and variational formulations.}, number={4}, journal={Journal of Machine Learning for Modeling and Computing}, publisher={Begell House}, author={Shin, Yeonjong and Zhang, Zhongqiang and Karniadakis, George Em}, year={2023}, pages={73–101} } @article{ainsworth_shin_2022, title={Active Neuron Least Squares: A training method for multivariate rectified neural networks}, volume={44}, url={http://dx.doi.org/10.1137/21m1460764}, DOI={10.1137/21m1460764}, abstractNote={We propose a novel training method for multivariate two-layer rectified neural networks. The core mechanism is the augmentation of the standard gradient descent direction by the inclusion of search vectors which are chosen to explicitly adjust the activation patterns of the neurons. Active neuron least squares (ANLS) proceeds iteratively with each iteration consisting of three steps: (a) generation of search vectors (including ones designed to change activation patterns), (b) identification of the candidate weights, and (c) a decision on which candidate weights are selected for update. We develop stable and efficient procedures for implementing the method.
Numerical examples are provided that demonstrate the effectiveness of ANLS compared with existing popular approaches on various learning tasks ranging from function approximation to solving PDEs.}, number={4}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Ainsworth, Mark and Shin, Yeonjong}, year={2022}, month={Aug}, pages={A2253–A2275} } @article{deng_shin_lu_zhang_karniadakis_2022, title={Approximation rates of DeepONets for learning operators arising from advection–diffusion equations}, volume={153}, url={http://dx.doi.org/10.1016/j.neunet.2022.06.019}, DOI={10.1016/j.neunet.2022.06.019}, abstractNote={We present the analysis of approximation rates of operator learning in Chen and Chen (1995) and Lu et al. (2021), where continuous operators are approximated by a sum of products of branch and trunk networks. In this work, we consider the rates of learning solution operators from both linear and nonlinear advection-diffusion equations with or without reaction. We find that the approximation rates depend on the architecture of branch networks as well as the smoothness of inputs and outputs of solution operators.}, journal={Neural Networks}, publisher={Elsevier BV}, author={Deng, Beichuan and Shin, Yeonjong and Lu, Lu and Zhang, Zhongqiang and Karniadakis, George Em}, year={2022}, month={Sep}, pages={411–426} } @article{jagtap_shin_kawaguchi_karniadakis_2022, title={Deep Kronecker neural networks: A general framework for neural networks with adaptive activation functions}, volume={468}, url={http://dx.doi.org/10.1016/j.neucom.2021.10.036}, DOI={10.1016/j.neucom.2021.10.036}, abstractNote={We propose a new type of neural networks, Kronecker neural networks (KNNs), that form a general framework for neural networks with adaptive activation functions. KNNs employ the Kronecker product, which provides an efficient way of constructing a very wide network while keeping the number of parameters low. Our theoretical analysis reveals that under suitable conditions, KNNs induce a faster decay of the loss than that of feed-forward networks. This is also empirically verified through a set of computational examples. Furthermore, under certain technical assumptions, we establish global convergence of gradient descent for KNNs. As a specific case, we propose the Rowdy activation function that is designed to get rid of any saturation region by injecting sinusoidal fluctuations, which include trainable parameters. The proposed Rowdy activation function can be employed in any neural network architecture, such as feed-forward neural networks, recurrent neural networks, and convolutional neural networks. The effectiveness of KNNs with Rowdy activation is demonstrated through various computational experiments including function approximation using feed-forward neural networks, solution inference of partial differential equations using physics-informed neural networks, and standard deep learning benchmark problems using convolutional and fully-connected neural networks.}, journal={Neurocomputing}, publisher={Elsevier BV}, author={Jagtap, Ameya D.
and Shin, Yeonjong and Kawaguchi, Kenji and Karniadakis, George Em}, year={2022}, month={Jan}, pages={165–180} } @article{shin_2022, title={Effects of depth, width, and initialization: A convergence analysis of layer-wise training for deep linear neural networks}, volume={20}, url={http://dx.doi.org/10.1142/s0219530521500263}, DOI={10.1142/s0219530521500263}, abstractNote={Deep neural networks have been used in various machine learning applications and achieved tremendous empirical successes. However, training deep neural networks is a challenging task. Many alternatives have been proposed in place of end-to-end back-propagation. Layer-wise training is one of them, which trains a single layer at a time, rather than training all layers simultaneously. In this paper, we study layer-wise training using block coordinate gradient descent (BCGD) for deep linear networks. We establish a general convergence analysis of BCGD and find the optimal learning rate, which results in the fastest decrease in the loss. We identify the effects of depth, width, and initialization. When the orthogonal-like initialization is employed, we show that the width of intermediate layers plays no role in gradient-based training beyond a certain threshold. Besides, we find that the use of deep networks could drastically accelerate convergence compared to that of a depth-1 network, even when the computational cost is considered. Numerical examples are provided to justify our theoretical findings and demonstrate the performance of layer-wise training by BCGD.}, number={1}, journal={Analysis and Applications}, publisher={World Scientific Pub Co Pte Ltd}, author={Shin, Yeonjong}, year={2022}, month={Jan}, pages={73–119} } @article{zhang_shin_karniadakis_2022, title={GFINNs: GENERIC formalism informed neural networks for deterministic and stochastic dynamical systems}, volume={380}, ISSN={1364-503X 1471-2962}, url={http://dx.doi.org/10.1098/rsta.2021.0207}, DOI={10.1098/rsta.2021.0207}, abstractNote={We propose the GENERIC formalism informed neural networks (GFINNs) that obey the symmetric degeneracy conditions of the GENERIC formalism. GFINNs comprise two modules, each of which contains two components. We model each component using a neural network whose architecture is designed to satisfy the required conditions. The component-wise architecture design provides flexible ways of leveraging available physics information into neural networks. We prove theoretically that GFINNs are sufficiently expressive to learn the underlying equations, hence establishing the universal approximation theorem. We demonstrate the performance of GFINNs in three simulation problems: gas containers exchanging heat and volume, the thermoelastic double pendulum, and the Langevin dynamics.
In all the examples, GFINNs outperform existing methods, hence demonstrating good accuracy in predictions for both deterministic and stochastic systems.}, number={2229}, journal={Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, publisher={The Royal Society}, author={Zhang, Zhen and Shin, Yeonjong and Karniadakis, George Em}, year={2022}, month={Jun} } @article{hou_shin_xiu_2021, title={Identification of Corrupted Data via $k$-Means Clustering for Function Approximation}, volume={2}, ISSN={2708-0560 2708-0579}, url={http://dx.doi.org/10.4208/csiam-am.2020-0212}, DOI={10.4208/csiam-am.2020-0212}, abstractNote={In addition to measurement noises, real-world data are often corrupted by unexpected internal or external errors. Corruption errors can be much larger than the standard noise and negatively affect data processing results. In this paper, we propose a method of identifying corrupted data in the context of function approximation. The method is a two-step procedure consisting of an approximation stage and an identification stage. In the approximation stage, we conduct straightforward function approximation on the entire data set for preliminary processing. In the identification stage, a clustering algorithm is applied to the processed data to identify the potentially corrupted data entries. In particular, we found the k-means clustering algorithm to be highly effective. Our theoretical analysis reveals that under sufficient conditions the proposed method can exactly identify all corrupted data entries. Numerous examples are provided to verify our theoretical findings and demonstrate the effectiveness of the method.}, number={1}, journal={CSIAM Transactions on Applied Mathematics}, publisher={Global Science Press}, author={Hou, Jun and Shin, Yeonjong and Xiu, Dongbin}, year={2021}, month={Jun}, pages={81–107} } @article{ainsworth_shin_2021, title={Plateau Phenomenon in Gradient Descent Training of ReLU Networks: Explanation, Quantification, and Avoidance}, volume={43}, url={http://dx.doi.org/10.1137/20m1353010}, DOI={10.1137/20m1353010}, abstractNote={The ability of neural networks to provide `best in class' approximation across a wide range of applications is well-documented. Nevertheless, the powerful expressivity of neural networks comes to naught if one is unable to effectively train (choose) the parameters defining the network. In general, neural networks are trained by gradient descent type optimization methods, or a stochastic variant thereof. In practice, with such methods the loss function decreases rapidly at the beginning of training but then, after a relatively small number of steps, significantly slows down. The loss may even appear to stagnate over the period of a large number of epochs, only to then suddenly start to decrease fast again for no apparent reason. This so-called plateau phenomenon manifests itself in many learning tasks. The present work aims to identify and quantify the root causes of the plateau phenomenon. No assumptions are made on the number of neurons relative to the number of training data, and our results hold for both the lazy and adaptive regimes.
The main findings are: plateaux correspond to periods during which activation patterns remain constant, where the activation pattern refers to the number of data points that activate a given neuron; quantification of convergence of the gradient flow dynamics; and characterization of stationary points in terms of solutions of local least squares regression lines over subsets of the training data. Based on these conclusions, we propose a new iterative training method, the Active Neuron Least Squares (ANLS), characterised by the explicit adjustment of the activation pattern at each step, which is designed to enable a quick exit from a plateau. Illustrative numerical examples are included throughout.}, number={5}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Ainsworth, Mark and Shin, Yeonjong}, year={2021}, month={Jan}, pages={A3438–A3468} } @article{lu_shin_su_karniadakis_2020, title={Dying ReLU and Initialization: Theory and Numerical Examples}, volume={28}, ISSN={1815-2406 1991-7120}, url={http://dx.doi.org/10.4208/cicp.oa-2020-0165}, DOI={10.4208/cicp.oa-2020-0165}, abstractNote={Recent theoretical work has demonstrated that deep neural networks have superior performance over shallow networks, but their training is more difficult, e.g., they suffer from the vanishing gradient problem. This problem can typically be resolved by the rectified linear unit (ReLU) activation. However, here we show that even for such activation, deep and narrow neural networks (NNs) will converge to erroneous mean or median states of the target function depending on the loss with high probability. Deep and narrow NNs are encountered in solving partial differential equations with high-order derivatives. We demonstrate this collapse of such NNs both numerically and theoretically, and provide estimates of the probability of collapse. We also construct a diagram of a safe region for designing NNs that avoid the collapse to erroneous states. Finally, we examine different ways of initialization and normalization that may avoid the collapse problem. Asymmetric initializations may reduce the probability of collapse but do not totally eliminate it.}, number={5}, journal={Communications in Computational Physics}, publisher={Global Science Press}, author={Lu, Lu and Shin, Yeonjong and Su, Yanhui and Karniadakis, George Em}, year={2020}, month={Jun}, pages={1671–1706} } @article{shin_darbon_karniadakis_2020, title={On the Convergence of Physics Informed Neural Networks for Linear Second-Order Elliptic and Parabolic Type PDEs}, volume={28}, ISSN={1815-2406 1991-7120}, url={http://dx.doi.org/10.4208/cicp.oa-2020-0193}, DOI={10.4208/cicp.oa-2020-0193}, abstractNote={Physics informed neural networks (PINNs) are deep learning based techniques for solving partial differential equations (PDEs) encountered in computational science and engineering. Guided by data and physical laws, PINNs find a neural network that approximates the solution to a system of PDEs. Such a neural network is obtained by minimizing a loss function in which any prior knowledge of PDEs and data are encoded. Despite its remarkable empirical success in one-, two-, or three-dimensional problems, there is little theoretical justification for PINNs. As the number of data grows, PINNs generate a sequence of minimizers which correspond to a sequence of neural networks. We want to answer the question: Does the sequence of minimizers converge to the solution to the PDE?
We consider two classes of PDEs: linear second-order elliptic and parabolic. By adapting the Schauder approach and the maximum principle, we show that the sequence of minimizers strongly converges to the PDE solution in $C^0$. Furthermore, we show that if each minimizer satisfies the initial/boundary conditions, the convergence mode becomes $H^1$. Computational examples are provided to illustrate our theoretical findings. To the best of our knowledge, this is the first theoretical work that shows the consistency of PINNs.}, number={5}, journal={Communications in Computational Physics}, publisher={Global Science Press}, author={Shin, Yeonjong and Darbon, Jérôme and Karniadakis, George Em}, year={2020}, month={Jun}, pages={2042–2074} } @article{shin_karniadakis_2020, title={Trainability of ReLU networks and data-dependent initialization}, volume={1}, url={http://dx.doi.org/10.1615/jmachlearnmodelcomput.2020034126}, DOI={10.1615/jmachlearnmodelcomput.2020034126}, abstractNote={In this paper we study the trainability of rectified linear unit (ReLU) networks at initialization. A ReLU neuron is said to be dead if it only outputs a constant for any input. Two death states of neurons are introduced: tentative and permanent death. A network is then said to be trainable if the number of permanently dead neurons is sufficiently small for a learning task. We refer to the probability of a randomly initialized network being trainable as trainability. We show that a network being trainable is a necessary condition for successful training, and the trainability serves as an upper bound of training success rates. In order to quantify the trainability, we study the probability distribution of the number of active neurons at initialization. In many applications, overspecified or overparameterized neural networks are successfully employed and shown to be trained effectively. With the notion of trainability, we show that overparameterization is both a necessary and a sufficient condition for achieving a zero training loss. Furthermore, we propose a data-dependent initialization method in an overparameterized setting. Numerical examples are provided to demonstrate the effectiveness of the method and our theoretical findings.}, number={1}, journal={Journal of Machine Learning for Modeling and Computing}, publisher={Begell House}, author={Shin, Yeonjong and Karniadakis, George Em}, year={2020}, pages={39–74} } @article{shin_wu_xiu_2018, title={Sequential function approximation with noisy data}, volume={371}, url={http://dx.doi.org/10.1016/j.jcp.2018.05.042}, DOI={10.1016/j.jcp.2018.05.042}, abstractNote={We present a sequential method for approximating an unknown function using random noisy samples. Unlike the traditional function approximation methods, the current method constructs the approximation using one sample at a time. This results in a simple numerical implementation using only vector operations and avoids the need to store the entire data set. The method is thus particularly suitable when the data set is exceedingly large. Furthermore, we present a general theoretical framework to define and interpret the method. Both upper and lower bounds of the method are established for the expectation of the results.
Numerical examples are provided to verify the theoretical findings.}, journal={Journal of Computational Physics}, publisher={Elsevier BV}, author={Shin, Yeonjong and Wu, Kailiang and Xiu, Dongbin}, year={2018}, month={Oct}, pages={363–381} } @article{shin_xiu_2017, title={A Randomized Algorithm for Multivariate Function Approximation}, volume={39}, url={http://dx.doi.org/10.1137/16m1075193}, DOI={10.1137/16m1075193}, abstractNote={The randomized Kaczmarz (RK) method is a randomized iterative algorithm for solving (overdetermined) linear systems of equations. In this paper, we extend the RK method to function approximation in a bounded domain. We demonstrate that by conducting the approximation randomly, one sample at a time, the method converges. Convergence analysis is conducted in terms of expectation, where we establish sharp upper and lower bounds for both the convergence rate of the algorithm and the error of the resulting approximation. The analysis also establishes the optimal sampling probability measure to achieve the optimal rate of convergence. Various numerical examples are provided to validate the theoretical results.}, number={3}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Shin, Yeonjong and Xiu, Dongbin}, year={2017}, month={Jan}, pages={A983–A1002} } @article{wu_shin_xiu_2017, title={A Randomized Tensor Quadrature Method for High Dimensional Polynomial Approximation}, volume={39}, url={http://dx.doi.org/10.1137/16m1081695}, DOI={10.1137/16m1081695}, abstractNote={We present a numerical method for polynomial approximation of multivariate functions. The method utilizes Gauss quadrature in tensor product form, which is known to be inefficient in high dimensions. Here we demonstrate that by using a new randomized algorithm and taking advantage of the tensor structure of the grids, a highly efficient algorithm can be constructed. The new method does not require prior knowledge/storage of the entire data set at all the tensor grid points, whose total number of points is excessively large in high dimensions. Instead, the method utilizes one data point at a time and iteratively conducts the approximation. This feature allows the use of the method irrespective of the size of the data set. We establish the rate of convergence of this iterative algorithm and show that its operational counts can be lower than those of standard methods, such as least squares, when applicable. Numerical examples in up to hundreds of dimensions are presented to verify the theoretical analysis and demo...}, number={5}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Wu, Kailiang and Shin, Yeonjong and Xiu, Dongbin}, year={2017}, month={Jan}, pages={A1811–A1833} } @article{yan_shin_xiu_2017, title={Sparse Approximation using $\ell_1-\ell_2$ Minimization and Its Application to Stochastic Collocation}, volume={39}, url={http://dx.doi.org/10.1137/15m103947x}, DOI={10.1137/15m103947x}, abstractNote={We discuss the properties of sparse approximation using $\ell_1-\ell_2$ minimization. We present several theoretical estimates regarding its recoverability for both sparse and nonsparse signals. We then apply the method to sparse orthogonal polynomial approximations for stochastic collocation, with a focus on the use of Legendre polynomials. We study the recoverability of both the standard $\ell_1-\ell_2$ minimization and Chebyshev weighted $\ell_1-\ell_2$ minimization.
It is noted that the Chebyshev weighted version is advantageous only in low dimensions, whereas the standard unweighted version is preferred in high dimensions. Various numerical examples are presented to verify the theoretical findings.}, number={1}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Yan, Liang and Shin, Yeonjong and Xiu, Dongbin}, year={2017}, month={Jan}, pages={A229–A254} } @article{shin_xiu_2016a, title={Correcting Data Corruption Errors for Multivariate Function Approximation}, volume={38}, url={http://dx.doi.org/10.1137/16m1059473}, DOI={10.1137/16m1059473}, abstractNote={We discuss the problem of constructing an accurate function approximation when data are corrupted by unexpected errors. The unexpected corruption errors are different from the standard observational noise in the sense that they can have much larger magnitude and in most cases are sparse. By focusing on the overdetermined case, we prove that the sparse corruption errors can be effectively eliminated by using $\ell_1$-minimization, also known as the least absolute deviations method. In particular, we establish probabilistic error bounds of the $\ell_1$-minimization solution with the corrupted data. Both the lower bound and the upper bound are related only to the errors of the $\ell_1$- and $\ell_2$-minimization solutions with respect to the uncorrupted data and the sparsity of the corruption errors. This ensures that the $\ell_1$-minimization solution with the corrupted data is close to the regression results with uncorrupted data, thus effectively eliminating the corruption errors. Several numerical examples ...}, number={4}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Shin, Yeonjong and Xiu, Dongbin}, year={2016}, month={Jan}, pages={A2492–A2511} } @article{shin_xiu_2016b, title={Nonadaptive Quasi-Optimal Points Selection for Least Squares Linear Regression}, volume={38}, url={http://dx.doi.org/10.1137/15m1015868}, DOI={10.1137/15m1015868}, abstractNote={In this paper we present a quasi-optimal sample set for ordinary least squares (OLS) regression. The quasi-optimal set is designed in such a way that, for a given number of samples, it can deliver the regression result as close as possible to the result obtained by a (much) larger set of candidate samples. The quasi-optimal set is determined by maximizing a quantity measuring the mutual column orthogonality and the determinant of the model matrix. This procedure is nonadaptive, in the sense that it does not depend on the sample data. This is useful in practice, as it allows one to determine, prior to the potentially expensive data collection procedure, where to sample the underlying system. In addition to presenting the theoretical motivation of the quasi-optimal set, we also present its efficient implementation via a greedy algorithm, along with several numerical examples to demonstrate its efficacy.
Since the quasi-optimal set allows one to obtain a near optimal regression result under any affordable nu...}, number={1}, journal={SIAM Journal on Scientific Computing}, publisher={Society for Industrial & Applied Mathematics (SIAM)}, author={Shin, Yeonjong and Xiu, Dongbin}, year={2016}, month={Jan}, pages={A385–A411} } @article{shin_xiu_2016c, title={On a near optimal sampling strategy for least squares polynomial regression}, volume={326}, url={http://dx.doi.org/10.1016/j.jcp.2016.09.032}, DOI={10.1016/j.jcp.2016.09.032}, abstractNote={We present a sampling strategy for least squares polynomial regression. The strategy combines two recently developed methods for the least squares method: the Christoffel least squares algorithm and quasi-optimal sampling. More specifically, our new strategy first chooses samples from the pluripotential equilibrium measure and then re-orders the samples by the quasi-optimal algorithm. A weighted least squares problem is solved on a (much) smaller sample set to obtain the regression result. It is then demonstrated that the new strategy results in a polynomial least squares method with high accuracy and robust stability at an almost minimal number of samples.}, journal={Journal of Computational Physics}, publisher={Elsevier BV}, author={Shin, Yeonjong and Xiu, Dongbin}, year={2016}, month={Dec}, pages={931–946} }