@comment{
  Journal articles, one entry per work. Citation keys follow the
  authors_year scheme; works that would collide get a letter suffix.
  The key chi_ipsen_2021 was previously used by two different papers, so
  the Linear Algebra and its Applications paper is now chi_ipsen_2021b
  and the Information and Inference paper keeps chi_ipsen_2021.
  Conventions: bare month macros, double-hyphen page ranges, bare DOIs,
  braces around title words whose capitalisation must survive sentence
  casing, and LaTeX escapes instead of non-ASCII characters.
  The abstractNote field is not read by standard styles; it is kept for
  tooling that may consume it.
  NOTE(review): chi_ipsen_2021 has no number or pages field, and
  chi_ipsen_hsiao_lin_wang_lee_lu_tzeng_2021 has no number, pages or
  article-number field -- confirm against the publisher records.
}

@article{chi_ipsen_2021,
  author       = {Chi, Jocelyn T. and Ipsen, Ilse C. F.},
  title        = {A projector-based approach to quantifying total and excess uncertainties for sketched linear regression},
  journal      = {Information and Inference: A Journal of the {IMA}},
  volume       = {8},
  year         = {2021},
  month        = aug,
  publisher    = {Oxford University Press (OUP)},
  issn         = {2049-8772},
  doi          = {10.1093/imaiai/iaab016},
  url          = {https://doi.org/10.1093/imaiai/iaab016},
  abstractNote = {Linear regression is a classic method of data analysis. In recent years, sketching---a method of dimension reduction using random sampling, random projections or both---has gained popularity as an effective computational approximation when the number of observations greatly exceeds the number of variables. In this paper, we address the following question: how does sketching affect the statistical properties of the solution and key quantities derived from it? To answer this question, we present a projector-based approach to sketched linear regression that is exact and that requires minimal assumptions on the sketching matrix. Therefore, downstream analyses hold exactly and generally for all sketching schemes. Additionally, a projector-based approach enables derivation of key quantities from classic linear regression that account for the combined model- and algorithm-induced uncertainties. We demonstrate the usefulness of a projector-based approach in quantifying and enabling insight on excess uncertainties and bias-variance decompositions for sketched linear regression. Finally, we demonstrate how the insights from our projector-based analyses can be used to produce practical sketching diagnostics to aid the design of judicious sketching schemes.},
}

@article{qin_lee_chi_drumetz_chanussot_lou_bertozzi_2021,
  author       = {Qin, Jing and Lee, Harlin and Chi, Jocelyn T. and Drumetz, Lucas and Chanussot, Jocelyn and Lou, Yifei and Bertozzi, Andrea L.},
  title        = {Blind Hyperspectral Unmixing Based on Graph Total Variation Regularization},
  journal      = {{IEEE} Transactions on Geoscience and Remote Sensing},
  volume       = {59},
  number       = {4},
  pages        = {3338--3351},
  year         = {2021},
  month        = apr,
  issn         = {1558-0644},
  doi          = {10.1109/TGRS.2020.3020810},
  abstractNote = {Remote sensing data from hyperspectral cameras suffer from limited spatial resolution, in which a single pixel of a hyperspectral image may contain information from several materials in the field of view. Blind hyperspectral image unmixing is the process of identifying the pure spectra of individual materials (i.e., endmembers) and their proportions (i.e., abundances) at each pixel. In this article, we propose a novel blind hyperspectral unmixing model based on the graph total variation (gTV) regularization, which can be solved efficiently by the alternating direction method of multipliers (ADMM). To further alleviate the computational cost, we apply the Nystr{\"o}m method to approximate a fully connected graph by a small subset of sampled points. Furthermore, we adopt the Merriman--Bence--Osher (MBO) scheme to solve the gTV-involved subproblem in ADMM by decomposing a gray-scale image into a bitwise form. A variety of numerical experiments on synthetic and real hyperspectral images are conducted, showcasing the potential of the proposed method in terms of identification accuracy and computational efficiency.},
}

@article{chi_ipsen_2021b,
  author       = {Chi, Jocelyn T. and Ipsen, Ilse C. F.},
  title        = {Multiplicative perturbation bounds for multivariate multiple linear regression in {Schatten} p-norms},
  journal      = {Linear Algebra and its Applications},
  volume       = {624},
  pages        = {87--102},
  year         = {2021},
  month        = sep,
  publisher    = {Elsevier BV},
  issn         = {1873-1856},
  doi          = {10.1016/j.laa.2021.03.039},
  url          = {https://doi.org/10.1016/j.laa.2021.03.039},
  abstractNote = {Multivariate multiple linear regression (MMLR), which occurs in a number of practical applications, generalizes traditional least squares (multivariate linear regression) to multiple right-hand sides. We extend recent MLR analyses to sketched MMLR in general Schatten p-norms by interpreting the sketched problem as a multiplicative perturbation. Our work represents an extension of Maher's results on Schatten p-norms. We derive expressions for the exact and perturbed solutions in terms of projectors for easy geometric interpretation. We also present a geometric interpretation of the action of the sketching matrix in terms of relevant subspaces. We show that a key term in assessing the accuracy of the sketched MMLR solution can be viewed as a tangent of a largest principal angle between subspaces under some assumptions. Our results enable additional interpretation of the difference between an orthogonal and oblique projector with the same range.},
}

@article{chi_ipsen_hsiao_lin_wang_lee_lu_tzeng_2021,
  author       = {Chi, Jocelyn T. and Ipsen, Ilse C. F. and Hsiao, Tzu-Hung and Lin, Ching-Heng and Wang, Li-San and Lee, Wan-Ping and Lu, Tzu-Pin and Tzeng, Jung-Ying},
  title        = {{SEAGLE}: A Scalable Exact Algorithm for Large-Scale Set-Based Gene-Environment Interaction Tests in Biobank Data},
  journal      = {Frontiers in Genetics},
  volume       = {12},
  year         = {2021},
  month        = nov,
  issn         = {1664-8021},
  doi          = {10.3389/fgene.2021.710055},
  abstractNote = {The explosion of biobank data offers unprecedented opportunities for gene-environment interaction (G$\times$E) studies of complex diseases because of the large sample sizes and the rich collection in genetic and non-genetic information. However, the extremely large sample size also introduces new computational challenges in G$\times$E assessment, especially for set-based G$\times$E variance component (VC) tests, which are a widely used strategy to boost overall G$\times$E signals and to evaluate the joint G$\times$E effect of multiple variants from a biologically meaningful unit (e.g., gene). In this work, we focus on continuous traits and present SEAGLE, a Scalable Exact AlGorithm for Large-scale set-based G$\times$E tests, to permit G$\times$E VC tests for biobank-scale data. SEAGLE employs modern matrix computations to calculate the test statistic and p-value of the G$\times$E VC test in a computationally efficient fashion, without imposing additional assumptions or relying on approximations. SEAGLE can easily accommodate sample sizes in the order of $10^5$, is implementable on standard laptops, and does not require specialized computing equipment. We demonstrate the performance of SEAGLE using extensive simulations. We illustrate its utility by conducting genome-wide gene-based G$\times$E analysis on the Taiwan Biobank data to explore the interaction of gene and physical activity status on body mass index.},
}

@article{chi_chi_baraniuk_2016,
  author       = {Chi, Jocelyn T. and Chi, Eric C. and Baraniuk, Richard G.},
  title        = {{k-POD}: A Method for k-Means Clustering of Missing Data},
  journal      = {The American Statistician},
  volume       = {70},
  number       = {1},
  pages        = {91--99},
  year         = {2016},
  month        = jan,
  issn         = {1537-2731},
  doi          = {10.1080/00031305.2015.1086685},
  url          = {https://doi.org/10.1080/00031305.2015.1086685},
  abstractNote = {The k-means algorithm is often used in clustering applications but its usage requires a complete data matrix. Missing data, however, are common in many applications. Mainstream approaches to clustering missing data reduce the missing data problem to a complete data formulation through either deletion or imputation but these solutions may incur significant costs. Our k-POD method presents a simple extension of k-means clustering for missing data that works even when the missingness mechanism is unknown, when external information is unavailable, and when there is significant missingness in the data. [Received November 2014. Revised August 2015.]},
}