@inproceedings{howell_baboulin_2018,
  author    = {Howell, Gary W. and Baboulin, Marc},
  title     = {Iterative Solution of Sparse Linear Least Squares Using {LU} Factorization},
  booktitle = {Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region ({HPC Asia} 2018)},
  year      = {2018},
  pages     = {47--53},
  doi       = {10.1145/3149457.3149462},
  abstract  = {In this paper, we are interested in computing the solution of an overdetermined sparse linear least squares problem Ax=b via the normal equations method. Transforming the normal equations using the L factor from a rectangular LU decomposition of A usually leads to a better conditioned problem. Here we explore a further preconditioning by inv(L1) where L1 is the n x n upper part of the lower trapezoidal m x n factor L. Since the condition number of the iteration matrix can be easily bounded, we can determine whether the iteration will be effective, and whether further pre-conditioning is required. Numerical experiments are performed with the Julia programming language. When the upper triangular matrix U has no near zero diagonal elements, the algorithm is observed to be reliable. When A has only a few more rows than columns, convergence requires relatively few iterations and the algorithm usually requires less storage than the Cholesky factor of AtA or the R factor of the QR factorization of A.},
}

@inproceedings{howell_baboulin_2016,
  author    = {Howell, Gary W. and Baboulin, Marc},
  title     = {{LU} Preconditioning for Overdetermined Sparse Least Squares Problems},
  booktitle = {Parallel Processing and Applied Mathematics ({PPAM} 2015), Part I},
  series    = {Lecture Notes in Computer Science},
  volume    = {9573},
  year      = {2016},
  pages     = {128--137},
  isbn      = {978-3-319-32148-6},
  issn      = {1611-3349},
  doi       = {10.1007/978-3-319-32149-3_13},
  abstract  = {We investigate how to use an LU factorization with the classical lsqr routine for solving overdetermined sparse least squares problems. Usually L is much better conditioned than A and iterating with L instead of A results in faster convergence. When a runtime test indicates that L is not sufficiently well-conditioned, a partial orthogonalization of L accelerates the convergence. Numerical experiments illustrate the good behavior of our algorithm in terms of storage and convergence.},
}

@article{howell_demmel_fulton_hammarling_marmol_2008,
  author    = {Howell, Gary W. and Demmel, James W. and Fulton, Charles T. and Hammarling, Sven and Marmol, Karen},
  title     = {Cache Efficient Bidiagonalization Using {BLAS} 2.5 Operators},
  journal   = {ACM Transactions on Mathematical Software},
  volume    = {34},
  number    = {3},
  year      = {2008},
  issn      = {1557-7295},
  doi       = {10.1145/1356052.1356055},
  abstract  = {On cache based computer architectures using current standard algorithms, Householder bidiagonalization requires a significant portion of the execution time for computing matrix singular values and vectors. In this paper we reorganize the sequence of operations for Householder bidiagonalization of a general m x n matrix, so that two (\_GEMV) vector-matrix multiplications can be done with one pass of the unreduced trailing part of the matrix through cache. Two new BLAS operations approximately cut in half the transfer of data from main memory to cache, reducing execution times by up to 25 per cent. We give detailed algorithm descriptions and compare timings with the current LAPACK bidiagonalization algorithm.},
}

@article{howell_diaa_2005,
  author    = {Howell, Gary W. and Diaa, N.},
  title     = {Algorithm 841: {BHESS}: {Gaussian} Reduction to a Similar Banded {Hessenberg} Form},
  journal   = {ACM Transactions on Mathematical Software},
  volume    = {31},
  number    = {1},
  year      = {2005},
  month     = mar,
  pages     = {166--185},
  issn      = {1557-7295},
  doi       = {10.1145/1055531.1055539},
  abstract  = {BHESS uses Gaussian similarity transformations to reduce a general real square matrix to similar upper Hessenberg form. Multipliers are bounded in root mean square by a user-supplied parameter. If the input matrix is not highly nonnormal and the user-supplied tolerance on multipliers is of a size greater than ten, the returned matrix usually has small upper bandwidth. In such a case, eigenvalues of the returned matrix can be determined by the bulge-chasing BR iteration or by Rayleigh quotient iteration. BHESS followed by BR iteration determines a complete spectrum in about one-fifth the time required for orthogonal reduction to Hessenberg form followed by QR iterations. The FORTRAN 77 code provided for BHESS runs efficiently on a cache-based architecture.},
}