@article{jeng_hu_sun_li_2024, title={Weak signal inclusion under dependence and applications in genome-wide association study}, volume={18}, ISSN={["1941-7330"]}, DOI={10.1214/23-AOAS1815}, abstractNote={Motivated by inquiries into weak signals in underpowered genome-wide association studies (GWASs), we consider the problem of retaining true signals that are not strong enough to be individually separable from a large amount of noise. We address the challenge from the perspective of false negative control and present false negative control (FNC) screening, a data-driven method to efficiently regulate the false negative proportion at a user-specified level. FNC screening is developed in a realistic setting with arbitrary covariance dependence between variables. We calibrate the overall dependence through a parameter whose scale is compatible with the existing phase diagram in high-dimensional sparse inference. Utilizing the new calibration, we asymptotically explicate the joint effect of covariance dependence, signal sparsity, and signal intensity on the proposed method. We interpret the results using a new phase diagram, which shows that FNC screening can efficiently select a set of candidate variables to retain a high proportion of signals even when the signals are not individually separable from noise. Finite-sample performance of FNC screening is compared to that of several existing methods in simulation studies. The proposed method outperforms the others in adapting to a user-specified false negative control level. We implement FNC screening to empower a two-stage GWAS procedure, which demonstrates substantial power gain when working with limited sample sizes in real applications.}, number={1}, journal={ANNALS OF APPLIED STATISTICS}, author={Jeng, X. Jessie and Hu, Yifei and Sun, Quan and Li, Yun}, year={2024}, month={Mar}, pages={841–857} } @article{jeng_2023, title={Estimating the proportion of signal variables under arbitrary covariance dependence}, volume={17}, ISSN={["1935-7524"]}, DOI={10.1214/23-EJS2119}, abstractNote={Accurately estimating the proportion of signals hidden in a large amount of noise variables is of interest in many scientific inquiries. In this paper, we consider realistic but theoretically challenging settings with arbitrary covariance dependence between variables. We define the mean absolute correlation (MAC) to measure the overall dependence strength and investigate a family of estimators for their performances in the full range of MAC. We explicate the joint effect of MAC and signal sparsity on the performances of the family of estimators and discover that the most powerful estimator under independence is no longer the most effective when the MAC dependence is strong enough. Motivated by this theoretical insight, we propose a new estimator to better adapt to arbitrary covariance dependence. The proposed method compares favorably to several existing methods in extensive finite-sample settings with strong to weak covariance dependence and real dependence structures from genetic association studies.}, number={1}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Jeng, X.
Jessie}, year={2023}, pages={950–979} } @article{jeng_hu_venkat_lu_tzeng_2023, title={Transfer learning with false negative control improves polygenic risk prediction}, volume={19}, ISSN={["1553-7404"]}, DOI={10.1371/journal.pgen.1010597}, abstractNote={A polygenic risk score (PRS) is a quantity that aggregates the effects of variants across the genome and estimates an individual’s genetic predisposition for a given trait. PRS analysis typically involves two input data sets: base data for effect size estimation and target data for individual-level prediction. Given the availability of large-scale base data, it has become increasingly common that the ancestral backgrounds of the base and target data do not perfectly match. In this paper, we treat the GWAS summary information obtained from the base data as knowledge learned from a pre-trained model and adopt a transfer learning framework to effectively leverage the knowledge learned from the base data, which may or may not have a similar ancestral background to the target samples, to build prediction models for target individuals. Our proposed transfer learning framework consists of two main steps: (1) conducting false negative control (FNC) marginal screening to extract useful knowledge from the base data; and (2) performing joint model training to integrate the knowledge extracted from the base data with the target training data for accurate trans-data prediction. This new approach can significantly enhance the computational and statistical efficiency of joint model training, alleviate over-fitting, and facilitate more accurate trans-data prediction whether the heterogeneity level between the target and base data sets is low or high.}, number={11}, journal={PLOS GENETICS}, author={Jeng, Xinge Jessie and Hu, Yifei and Venkat, Vaishnavi and Lu, Tzu-Pin and Tzeng, Jung-Ying}, year={2023}, month={Nov} } @article{jeng_peng_lu_2021, title={Model Selection With Mixed Variables on the Lasso Path}, volume={83}, ISSN={["0976-8394"]}, DOI={10.1007/s13571-019-00219-5}, number={1}, journal={SANKHYA-SERIES B-APPLIED AND INTERDISCIPLINARY STATISTICS}, author={Jeng, X. Jessie and Peng, Huimin and Lu, Wenbin}, year={2021}, month={May}, pages={170–184} } @article{jeng_rhyne_zhang_tzeng_2020, title={Effective SNP ranking improves the performance of eQTL mapping}, volume={44}, ISSN={["1098-2272"]}, DOI={10.1002/gepi.22293}, number={6}, journal={GENETIC EPIDEMIOLOGY}, author={Jeng, X. Jessie and Rhyne, Jacob and Zhang, Teng and Tzeng, Jung-Ying}, year={2020}, month={Sep}, pages={611–619} } @article{rhyne_jeng_chi_tzeng_2020, title={FastLORS: Joint modelling for expression quantitative trait loci mapping in R}, volume={9}, ISSN={["2049-1573"]}, url={https://doi.org/10.1002/sta4.265}, DOI={10.1002/sta4.265}, abstractNote={FastLORS is a software package that implements a new algorithm to solve sparse multivariate regression for expression quantitative trait loci (eQTL) mapping. FastLORS solves the same optimization problem as LORS, an existing popular algorithm. The optimization problem is solved through inexact block coordinate descent with updates by proximal gradient steps, which reduces the computational cost compared with LORS. We apply LORS and FastLORS to a real dataset for eQTL mapping and demonstrate that FastLORS delivers results comparable to LORS in much less computing time.}, number={1}, journal={STAT}, publisher={Wiley}, author={Rhyne, Jacob and Jeng, X. Jessie and Chi, Eric C.
and Tzeng, Jung-Ying}, year={2020} } @article{jeng_chen_2019a, title={Predictor ranking and false discovery proportion control in high-dimensional regression}, volume={171}, ISSN={["0047-259X"]}, DOI={10.1016/j.jmva.2018.12.006}, abstractNote={We propose a ranking and selection procedure to prioritize relevant predictors and control the false discovery proportion (FDP) in variable selection. Our procedure utilizes a new ranking method built upon the de-sparsified Lasso estimator. We show that the new ranking method achieves the optimal order of minimum non-zero effects in ranking relevant predictors ahead of irrelevant ones. Adopting the new ranking method, we develop a variable selection procedure to asymptotically control FDP at a user-specified level. We show that our procedure can consistently estimate the FDP of variable selection as long as the de-sparsified Lasso estimator is asymptotically normal. In simulations, our procedure compares favorably to existing methods in ranking efficiency and FDP control when the regression model is relatively sparse.}, journal={JOURNAL OF MULTIVARIATE ANALYSIS}, author={Jeng, X. Jessie and Chen, Xiongzhi}, year={2019}, month={May}, pages={163–175} } @article{jeng_chen_2019b, title={Variable selection via adaptive false negative control in linear regression}, volume={13}, ISSN={["1935-7524"]}, DOI={10.1214/19-EJS1649}, abstractNote={Variable selection methods have been developed in linear regression to provide sparse solutions. Recent studies have focused on further interpretations of the sparse solutions in terms of false positive control. In this paper, we consider false negative control for variable selection with the goal of efficiently selecting a high proportion of relevant predictors. Different from existing studies in power analysis and sure screening, we propose to directly estimate the false negative proportion (FNP) of a decision rule and select the smallest subset of predictors whose estimated FNP is less than a user-specified control level. The proposed method is adaptive to the user-specified control level on FNP, selecting fewer candidates if a higher level is implemented. On the other hand, when the data have a stronger effect size or a larger sample size, the proposed method controls FNP more efficiently with fewer false positives. New analytic techniques are developed to cope with the major challenge of FNP control when relevant predictors cannot be consistently separated from irrelevant ones. Our numerical results are in line with the theoretical findings.}, number={2}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Jeng, X. Jessie and Chen, Xiongzhi}, year={2019}, pages={5306–5333} } @article{jeng_zhang_tzeng_2018, title={Efficient Signal Inclusion With Genomic Applications}, ISSN={0162-1459 1537-274X}, url={http://dx.doi.org/10.1080/01621459.2018.1518236}, DOI={10.1080/01621459.2018.1518236}, abstractNote={This article addresses the challenge of efficiently capturing a high proportion of true signals for subsequent data analyses when sample sizes are relatively limited with respect to data dimension. We propose the signal missing rate (SMR) as a new measure for false-negative control to account for the variability of the false-negative proportion. Novel data-adaptive procedures are developed to control SMR without incurring many unnecessary false positives under dependence. We justify the efficiency and adaptivity of the proposed methods via theory and simulation.
The proposed methods are applied to a GWAS on human height to effectively remove irrelevant single nucleotide polymorphisms (SNPs) while retaining a high proportion of relevant SNPs for subsequent polygenic analysis. Supplementary materials for this article are available online.}, journal={Journal of the American Statistical Association}, publisher={Informa UK Limited}, author={Jeng, X. Jessie and Zhang, Teng and Tzeng, Jung-Ying}, year={2018}, month={Sep}, pages={1–23} } @article{jeng_lu_peng_2018, title={High-dimensional inference for personalized treatment decision}, volume={12}, ISSN={["1935-7524"]}, DOI={10.1214/18-ejs1439}, abstractNote={Recent developments in statistical methodology for personalized treatment decisions have utilized high-dimensional regression to take into account a large number of patients' covariates and have described personalized treatment decisions through interactions between treatment and covariates. While a subset of interaction terms can be obtained by existing variable selection methods to indicate relevant covariates for making treatment decisions, the results often lack statistical interpretation. This paper proposes an asymptotically unbiased estimator of the interaction coefficients based on the Lasso solution. We derive the limiting distribution of the estimator when the baseline function of the regression model is unknown and possibly misspecified. Confidence intervals and p-values are derived to infer the effects of the patients' covariates in making treatment decisions. We confirm the accuracy of the proposed method and its robustness against a misspecified baseline function in simulation and apply the method to the STAR*D study of major depressive disorder.}, number={1}, journal={ELECTRONIC JOURNAL OF STATISTICS}, author={Jeng, X. Jessie and Lu, Wenbin and Peng, Huimin}, year={2018}, pages={2074–2089} } @article{jeng_2016, title={Detecting weak signals in high dimensions}, volume={147}, ISSN={["0047-259X"]}, DOI={10.1016/j.jmva.2016.02.004}, abstractNote={Fast-emerging high-throughput technology advances scientific applications into a new era by enabling the detection of information-bearing signals of unprecedented sizes. Despite its potential, the analysis of ultrahigh-dimensional data involves fundamental challenges, wherein the deluge of a large amount of irrelevant data can easily obscure the true signals. Classical statistical methods for low- to moderate-dimensional data focus on identifying strong true signals using false positive control criteria. These methods, however, have limited power for identifying weak true signals embedded in an extremely large amount of noise. This paper seeks to facilitate the detection of weak signals by introducing a new approach based on false negative instead of false positive control. As a result, a high proportion of weak signals can be retained for follow-up study. The new procedure is completely data-driven and fast in computation. We show in theory its efficiency and adaptivity to the unknown features of the data, including signal intensity and sparsity. Simulation studies further evaluate the method under various model settings. We apply the new method in a real-data analysis on detecting genomic variants with varying signal intensities.}, journal={JOURNAL OF MULTIVARIATE ANALYSIS}, author={Jeng, X.
Jessie}, year={2016}, month={May}, pages={234–246} } @article{jeng_daye_lu_tzeng_2016, title={Rare variants association analysis in large-scale sequencing studies at the single locus level}, volume={12}, number={6}, journal={PLoS Computational Biology}, author={Jeng, X. J. and Daye, Z. J. and Lu, W. B. and Tzeng, J. Y.}, year={2016} } @article{song_lu_ma_jeng_2014, title={Censored rank independence screening for high-dimensional survival data}, volume={101}, ISSN={0006-3444 1464-3510}, url={http://dx.doi.org/10.1093/biomet/asu047}, DOI={10.1093/biomet/asu047}, abstractNote={In modern statistical applications, the dimension of covariates can be much larger than the sample size. In the context of linear models, correlation screening (Fan and Lv, 2008) has been shown to reduce the dimension of such data effectively while achieving the sure screening property, i.e., all of the active variables can be retained with high probability. However, screening based on the Pearson correlation does not perform well when applied to contaminated covariates and/or censored outcomes. In this paper, we study censored rank independence screening of high-dimensional survival data. The proposed method is robust to predictors that contain outliers, works for a general class of survival models, and enjoys the sure screening property. Simulations and an analysis of real data demonstrate that the proposed method performs competitively on survival data sets of moderate size and high-dimensional predictors, even when these are contaminated.}, number={4}, journal={Biometrika}, publisher={Oxford University Press (OUP)}, author={Song, R. and Lu, W. and Ma, S. and Jeng, X. (Jessie)}, year={2014}, month={Oct}, pages={799–814} } @article{vardhanabhuti_jeng_wu_li_2014, title={Parametric modeling of whole-genome sequencing data for CNV identification}, volume={15}, ISSN={["1468-4357"]}, DOI={10.1093/biostatistics/kxt060}, abstractNote={Copy number variants (CNVs) constitute an important class of genetic variants in the human genome and have been shown to be associated with complex diseases. Whole-genome sequencing provides an unbiased way of identifying all the CNVs that an individual carries. In this paper, we consider parametric modeling of the read depth (RD) data from whole-genome sequencing with the aim of identifying the CNVs, including both Poisson and negative-binomial modeling of such count data. We propose a unified approach of using a mean-matching variance stabilizing transformation to turn the relatively complicated problem of sparse segment identification for count data into a sparse segment identification problem for a sequence of Gaussian data. We apply the optimal sparse segment identification procedure to the transformed data in order to identify the CNV segments. This provides a computationally efficient approach for RD-based CNV identification. Simulation results show that this approach often results in a small number of false identifications of the CNVs and has similar or better performance in identifying the true CNVs when compared with other RD-based approaches. We demonstrate the methods using the trio data from the 1000 Genomes Project.}, number={3}, journal={BIOSTATISTICS}, author={Vardhanabhuti, Saran and Jeng, X.
Jessie and Wu, Yinghua and Li, Hongzhe}, year={2014}, month={Jul}, pages={427–441} } @article{jeng_cai_li_2013, title={Simultaneous discovery of rare and common segment variants}, volume={100}, ISSN={["1464-3510"]}, DOI={10.1093/biomet/ass059}, abstractNote={The copy number variant is an important type of genetic structural variation appearing in germline DNA, ranging from common to rare in a population. Both rare and common copy number variants have been reported to be associated with complex diseases, and it is therefore important to simultaneously identify both based on a large set of population samples. We develop a proportion adaptive segment selection procedure that automatically adjusts to the unknown proportions of the carriers of the segment variants. We characterize the detection boundary that separates the region where a segment variant is detectable by some method from the region where it cannot be detected. Although the detection boundaries are very different for the rare and common segment variants, it is shown that the proposed procedure can reliably identify both whenever they are detectable. Compared with methods for single-sample analysis, this procedure gains power by pooling information from multiple samples. The method is applied to analyze neuroblastoma samples and identifies a large number of copy number variants that are missed by single-sample methods.}, number={1}, journal={BIOMETRIKA}, author={Jeng, X. Jessie and Cai, T. Tony and Li, Hongzhe}, year={2013}, month={Mar}, pages={157–172} } @article{daye_jeng_2009, title={Shrinkage and model selection with correlated variables via weighted fusion}, volume={53}, ISSN={0167-9473}, url={http://dx.doi.org/10.1016/j.csda.2008.11.007}, DOI={10.1016/j.csda.2008.11.007}, abstractNote={In this paper, we propose the weighted fusion, a new penalized regression and variable selection method for data with correlated variables. The weighted fusion can potentially incorporate information redundancy among correlated variables for estimation and variable selection. Weighted fusion is also useful when the number of predictors p is larger than the number of observations n. It allows the selection of more than n variables in a motivated way. Real data and simulation examples show that weighted fusion can improve variable selection and prediction accuracy.}, number={4}, journal={Computational Statistics & Data Analysis}, publisher={Elsevier BV}, author={Daye, Z. John and Jeng, X. Jessie}, year={2009}, month={Feb}, pages={1284–1298} }