@article{wang_zhang_wu_2019,
  author  = {Wang, Xin and Zhang, Hao Helen and Wu, Yichao},
  title   = {Multiclass Probability Estimation With Support Vector Machines},
  journal = {Journal of Computational and Graphical Statistics},
  year    = {2019},
  month   = jul,
  volume  = {28},
  number  = {3},
  pages   = {586--595},
  issn    = {1537-2715},
  doi     = {10.1080/10618600.2019.1585260},
  abstractNote = {Abstract Multiclass classification and probability estimation have important applications in data analytics. Support vector machines (SVMs) have shown great success in various real-world problems due to their high classification accuracy. However, one main limitation of standard SVMs is that they do not provide class probability estimates, and thus fail to offer uncertainty measure about class prediction. In this article, we propose a simple yet effective framework to endow kernel SVMs with the feature of multiclass probability estimation. The new probability estimator does not rely on any parametric assumption on the data distribution, therefore, it is flexible and robust. Theoretically, we show that the proposed estimator is asymptotically consistent. Computationally, the new procedure can be conveniently implemented using standard SVM softwares. Our extensive numerical studies demonstrate competitive performance of the new estimator when compared with existing methods such as multiple logistic regression, linear discrimination analysis, tree-based methods, and random forest, under various classification settings. Supplementary materials for this article are available online.},
}
@article{caner_zhang_2014,
  author  = {Caner, Mehmet and Zhang, Hao Helen},
  title   = {Adaptive Elastic Net for Generalized Methods of Moments},
  journal = {Journal of Business \& Economic Statistics},
  year    = {2014},
  month   = jan,
  volume  = {32},
  number  = {1},
  pages   = {30--47},
  issn    = {1537-2707},
  doi     = {10.1080/07350015.2013.836104},
  abstractNote = {Model selection and estimation are crucial parts of econometrics. This article introduces a new technique that can simultaneously estimate and select the model in generalized method of moments (GMM) context. The GMM is particularly powerful for analyzing complex datasets such as longitudinal and panel data, and it has wide applications in econometrics. This article extends the least squares based adaptive elastic net estimator by Zou and Zhang to nonlinear equation systems with endogenous variables. The extension is not trivial and involves a new proof technique due to estimators’ lack of closed-form solutions. Compared to Bridge-GMM by Caner, we allow for the number of parameters to diverge to infinity as well as collinearity among a large number of variables; also, the redundant parameters are set to zero via a data-dependent technique. This method has the oracle property, meaning that we can estimate nonzero parameters with their standard limit and the redundant parameters are dropped from the equations simultaneously. Numerical examples are used to illustrate the performance of the new method.},
}
@article{shin_wu_zhang_2014,
  author  = {Shin, S. J. and Wu, Y. C. and Zhang, H. H.},
  title   = {Two-dimensional solution surface for weighted support vector machines},
  journal = {Journal of Computational and Graphical Statistics},
  year    = {2014},
  volume  = {23},
  number  = {2},
  pages   = {383--402},
  doi     = {10.1080/10618600.2012.761139},
  abstractNote = {The support vector machine (SVM) is a popular learning method for binary classification. Standard SVMs treat all the data points equally, but in some practical problems it is more natural to assign different weights to observations from different classes. This leads to a broader class of learning, the so-called weighted SVMs (WSVMs), and one of their important applications is to estimate class probabilities besides learning the classification boundary. There are two parameters associated with the WSVM optimization problem: one is the regularization parameter and the other is the weight parameter. In this article, we first establish that the WSVM solutions are jointly piecewise-linear with respect to both the regularization and weight parameter. We then develop a state-of-the-art algorithm that can compute the entire trajectory of the WSVM solutions for every pair of the regularization parameter and the weight parameter at a feasible computational cost. The derived two-dimensional solution surface provides theoretical insight on the behavior of the WSVM solutions. Numerically, the algorithm can greatly facilitate the implementation of the WSVM and automate the selection process of the optimal regularization parameter. We illustrate the new algorithm on various examples. This article has online supplementary materials.},
}
@article{sharma_bondell_zhang_2013,
  author  = {Sharma, Dhruv B. and Bondell, Howard D. and Zhang, Hao Helen},
  title   = {Consistent Group Identification and Variable Selection in Regression With Correlated Predictors},
  journal = {Journal of Computational and Graphical Statistics},
  year    = {2013},
  month   = jun,
  volume  = {22},
  number  = {2},
  pages   = {319--340},
  issn    = {1537-2715},
  doi     = {10.1080/15533174.2012.707849},
  internal-note = {NOTE(review): DOI prefix 10.1080/15533174 does not match the JCGS journal prefix 10.1080/10618600 used elsewhere in this file -- verify against the publisher record},
  abstractNote = {Statistical procedures for variable selection have become integral elements in any analysis. Successful procedures are characterized by high predictive accuracy, yielding interpretable models while retaining computational efficiency. Penalized methods that perform coefficient shrinkage have been shown to be successful in many cases. Models with correlated predictors are particularly challenging to tackle. We propose a penalization procedure that performs variable selection while clustering groups of predictors automatically. The oracle properties of this procedure, including consistency in group identification, are also studied. The proposed method compares favorably with existing selection approaches in both prediction accuracy and model discovery, while retaining its computational efficiency. Supplementary materials are available online.},
}
@article{lu_zhang_zeng_2013,
  author  = {Lu, Wenbin and Zhang, Hao Helen and Zeng, Donglin},
  title   = {Variable selection for optimal treatment decision},
  journal = {Statistical Methods in Medical Research},
  year    = {2013},
  month   = oct,
  volume  = {22},
  number  = {5},
  pages   = {493--504},
  issn    = {1477-0334},
  doi     = {10.1177/0962280211428383},
  abstractNote = {In decision-making on optimal treatment strategies, it is of great importance to identify variables that are involved in the decision rule, i.e. those interacting with the treatment. Effective variable selection helps to improve the prediction accuracy and enhance the interpretability of the decision rule. We propose a new penalized regression framework which can simultaneously estimate the optimal treatment strategy and identify important variables. The advantages of the new approach include: (i) it does not require the estimation of the baseline mean function of the response, which greatly improves the robustness of the estimator; (ii) the convenient loss-based framework makes it easier to adopt shrinkage methods for variable selection, which greatly facilitates implementation and statistical inferences for the estimator. The new procedure can be easily implemented by existing state-of-art software packages like LARS. Theoretical properties of the new estimator are studied. Its empirical performance is evaluated using simulation studies and further illustrated with an application to an AIDS clinical trial.},
}
@article{yuan_zhang_davidian_2012,
  author  = {Yuan, Shuai and Zhang, Hao Helen and Davidian, Marie},
  title   = {Variable selection for covariate-adjusted semiparametric inference in randomized clinical trials},
  journal = {Statistics in Medicine},
  year    = {2012},
  month   = dec,
  volume  = {31},
  number  = {29},
  pages   = {3789--3804},
  issn    = {1097-0258},
  doi     = {10.1002/sim.5433},
  abstractNote = {Extensive baseline covariate information is routinely collected on participants in randomized clinical trials, and it is well recognized that a proper covariate‐adjusted analysis can improve the efficiency of inference on the treatment effect. However, such covariate adjustment has engendered considerable controversy, as post hoc selection of covariates may involve subjectivity and may lead to biased inference, whereas prior specification of the adjustment may exclude important variables from consideration. Accordingly, how to select covariates objectively to gain maximal efficiency is of broad interest. We propose and study the use of modern variable selection methods for this purpose in the context of a semiparametric framework, under which variable selection in modeling the relationship between outcome and covariates is separated from estimation of the treatment effect, circumventing the potential for selection bias associated with standard analysis of covariance methods. We demonstrate that such objective variable selection techniques combined with this framework can identify key variables and lead to unbiased and efficient inference on the treatment effect. A critical issue in finite samples is validity of estimators of uncertainty, such as standard errors and confidence intervals for the treatment effect. We propose an approach to estimation of sampling variation of estimated treatment effect and show its superior performance relative to that of existing methods. Copyright © 2012 John Wiley & Sons, Ltd.},
}
@article{liu_zhang_wu_2011,
  author  = {Liu, Yufeng and Zhang, Hao Helen and Wu, Yichao},
  title   = {Hard or Soft Classification? Large-Margin Unified Machines},
  journal = {Journal of the American Statistical Association},
  year    = {2011},
  month   = mar,
  volume  = {106},
  number  = {493},
  pages   = {166--177},
  issn    = {0162-1459},
  doi     = {10.1198/jasa.2011.tm10319},
  abstractNote = {Margin-based classifiers have been popular in both machine learning and statistics for classification problems. Among numerous classifiers, some are hard classifiers while some are soft ones. Soft classifiers explicitly estimate the class conditional probabilities and then perform classification based on estimated probabilities. In contrast, hard classifiers directly target the classification decision boundary without producing the probability estimation. These two types of classifiers are based on different philosophies and each has its own merits. In this article, we propose a novel family of large-margin classifiers, namely large-margin unified machines (LUMs), which covers a broad range of margin-based classifiers including both hard and soft ones. By offering a natural bridge from soft to hard classification, the LUM provides a unified algorithm to fit various classifiers and hence a convenient platform to compare hard and soft classification. Both theoretical consistency and numerical performance of LUMs are explored. Our numerical study sheds some light on the choice between hard and soft classifiers in various classification problems.},
}
@article{zhang_cheng_liu_2011,
  author  = {Zhang, Hao Helen and Cheng, Guang and Liu, Yufeng},
  title   = {Linear or Nonlinear? Automatic Structure Discovery for Partially Linear Models},
  journal = {Journal of the American Statistical Association},
  year    = {2011},
  month   = sep,
  volume  = {106},
  number  = {495},
  pages   = {1099--1112},
  issn    = {1537-274X},
  doi     = {10.1198/jasa.2011.tm10281},
  abstractNote = {Partially linear models provide a useful class of tools for modeling complex data by naturally incorporating a combination of linear and nonlinear effects within one framework. One key question in partially linear models is the choice of model structure, that is, how to decide which covariates are linear and which are nonlinear. This is a fundamental, yet largely unsolved problem for partially linear models. In practice, one often assumes that the model structure is given or known and then makes estimation and inference based on that structure. Alternatively, there are two methods in common use for tackling the problem: hypotheses testing and visual screening based on the marginal fits. Both methods are quite useful in practice but have their drawbacks. First, it is difficult to construct a powerful procedure for testing multiple hypotheses of linear against nonlinear fits. Second, the screening procedure based on the scatterplots of individual covariate fits may provide an educated guess on the regression function form, but the procedure is ad hoc and lacks theoretical justifications. In this article, we propose a new approach to structure selection for partially linear models, called the LAND (Linear And Nonlinear Discoverer). The procedure is developed in an elegant mathematical framework and possesses desired theoretical and computational properties. Under certain regularity conditions, we show that the LAND estimator is able to identify the underlying true model structure correctly and at the same time estimate the multivariate regression function consistently. The convergence rate of the new estimator is established as well. We further propose an iterative algorithm to implement the procedure and illustrate its performance by simulated and real examples. Supplementary materials for this article are available online.},
}
@article{storlie_bondell_reich_zhang_2011,
  author  = {Storlie, C. B. and Bondell, H. D. and Reich, B. J. and Zhang, H. H.},
  title   = {Surface estimation, variable selection, and the nonparametric oracle property},
  journal = {Statistica Sinica},
  year    = {2011},
  volume  = {21},
  number  = {2},
  pages   = {679--705},
}
@article{rahardja_zhao_zhang_2010,
  author  = {Rahardja, Dewi and Zhao, Yan D. and Zhang, Hao Helen},
  title   = {{Bayesian} Inference of Odds Ratios in Misclassified Binary Data with a Validation Substudy},
  journal = {Communications in Statistics - Simulation and Computation},
  year    = {2010},
  volume  = {39},
  number  = {10},
  pages   = {1845--1854},
  issn    = {0361-0918},
  doi     = {10.1080/03610918.2010.518271},
  abstractNote = {We propose a fully Bayesian model with a non-informative prior for analyzing misclassified binary data with a validation substudy. In addition, we derive a closed-form algorithm for drawing all parameters from the posterior distribution and making statistical inference on odds ratios. Our algorithm draws each parameter from a beta distribution, avoids the specification of initial values, and does not have convergence issues. We apply the algorithm to a data set and compare the results with those obtained by other methods. Finally, the performance of our algorithm is assessed using simulation studies.},
}
@article{lu_zhang_2010,
  author  = {Lu, Wenbin and Zhang, Hao Helen},
  title   = {On Estimation of Partially Linear Transformation Models},
  journal = {Journal of the American Statistical Association},
  year    = {2010},
  month   = jun,
  volume  = {105},
  number  = {490},
  pages   = {683--691},
  issn    = {1537-274X},
  doi     = {10.1198/jasa.2010.tm09302},
  abstractNote = {We study a general class of partially linear transformation models, which extend linear transformation models by incorporating nonlinear covariate effects in survival data analysis. A new martingale-based estimating equation approach, consisting of both global and kernel-weighted local estimation equations, is developed for estimating the parametric and nonparametric covariate effects in a unified manner. We show that with a proper choice of the kernel bandwidth parameter, one can obtain the consistent and asymptotically normal parameter estimates for the linear effects. Asymptotic properties of the estimated nonlinear effects are established as well. We further suggest a simple resampling method to estimate the asymptotic variance of the linear estimates and show its effectiveness. To facilitate the implementation of the new procedure, an iterative algorithm is developed. Numerical examples are given to illustrate the finite-sample performance of the procedure. Supplementary materials are available online.},
}
@article{zhang_lu_wang_2010,
  author  = {Zhang, Hao Helen and Lu, Wenbin and Wang, Hansheng},
  title   = {On sparse estimation for semiparametric linear transformation models},
  journal = {Journal of Multivariate Analysis},
  year    = {2010},
  month   = aug,
  volume  = {101},
  number  = {7},
  pages   = {1594--1606},
  issn    = {0047-259X},
  doi     = {10.1016/j.jmva.2010.01.015},
  abstractNote = {Semiparametric linear transformation models have received much attention due to their high flexibility in modeling survival data. A useful estimating equation procedure was recently proposed by Chen et al. (2002) [21] for linear transformation models to jointly estimate parametric and nonparametric terms. They showed that this procedure can yield a consistent and robust estimator. However, the problem of variable selection for linear transformation models has been less studied, partially because a convenient loss function is not readily available under this context. In this paper, we propose a simple yet powerful approach to achieve both sparse and consistent estimation for linear transformation models. The main idea is to derive a profiled score from the estimating equation of Chen et al. [21], construct a loss function based on the profile scored and its variance, and then minimize the loss subject to some shrinkage penalty. Under regularity conditions, we have shown that the resulting estimator is consistent for both model estimation and variable selection. Furthermore, the estimated parametric terms are asymptotically normal and can achieve a higher efficiency than that yielded from the estimation equations. For computation, we suggest a one-step approximation algorithm which can take advantage of the LARS and build the entire solution path efficiently. Performance of the new procedure is illustrated through numerous simulations and real examples including one microarray data.},
}
@article{wu_zhang_liu_2010,
  author  = {Wu, Yichao and Zhang, Hao Helen and Liu, Yufeng},
  title   = {Robust Model-Free Multiclass Probability Estimation},
  journal = {Journal of the American Statistical Association},
  year    = {2010},
  month   = mar,
  volume  = {105},
  number  = {489},
  pages   = {424--436},
  issn    = {0162-1459},
  doi     = {10.1198/jasa.2010.tm09107},
  abstractNote = {Classical statistical approaches for multiclass probability estimation are typically based on regression techniques such as multiple logistic regression, or density estimation approaches such as linear discriminant analysis (LDA) and quadratic discriminant analysis (QDA). These methods often make certain assumptions on the form of probability functions or on the underlying distributions of subclasses. In this article, we develop a model-free procedure to estimate multiclass probabilities based on large-margin classifiers. In particular, the new estimation scheme is employed by solving a series of weighted large-margin classifiers and then systematically extracting the probability information from these multiple classification rules. A main advantage of the proposed probability estimation technique is that it does not impose any strong parametric assumption on the underlying distribution and can be applied for a wide range of large-margin classification methods. A general computational algorithm is developed for class probability estimation. Furthermore, we establish asymptotic consistency of the probability estimates. Both simulated and real data examples are presented to illustrate competitive performance of the new approach and compare it with several other existing methods.},
}
@article{shows_lu_zhang_2010,
  author  = {Shows, Justin Hall and Lu, Wenbin and Zhang, Hao Helen},
  title   = {Sparse estimation and inference for censored median regression},
  journal = {Journal of Statistical Planning and Inference},
  year    = {2010},
  month   = jul,
  volume  = {140},
  number  = {7},
  pages   = {1903--1917},
  issn    = {1873-1171},
  doi     = {10.1016/j.jspi.2010.01.043},
  abstractNote = {Censored median regression has proved useful for analyzing survival data in complicated situations, say, when the variance is heteroscedastic or the data contain outliers. In this paper, we study the sparse estimation for censored median regression models, which is an important problem for high dimensional survival data analysis. In particular, a new procedure is proposed to minimize an inverse-censoring-probability weighted least absolute deviation loss subject to the adaptive LASSO penalty and result in a sparse and robust median estimator. We show that, with a proper choice of the tuning parameter, the procedure can identify the underlying sparse model consistently and has desired large-sample properties including root-n consistency and the asymptotic normality. The procedure also enjoys great advantages in computation, since its entire solution path can be obtained efficiently. Furthermore, we propose a resampling method to estimate the variance of the estimator. The performance of the procedure is illustrated by extensive simulations and two real data applications including one microarray gene expression survival data.},
}
@article{ni_zhang_zhang_2010,
  author  = {Ni, Xiao and Zhang, Daowen and Zhang, Hao Helen},
  title   = {Variable Selection for Semiparametric Mixed Models in Longitudinal Studies},
  journal = {Biometrics},
  year    = {2010},
  month   = mar,
  volume  = {66},
  number  = {1},
  pages   = {79--88},
  issn    = {1541-0420},
  doi     = {10.1111/j.1541-0420.2009.01240.x},
  abstractNote = {Summary We propose a double‐penalized likelihood approach for simultaneous model selection and estimation in semiparametric mixed models for longitudinal data. Two types of penalties are jointly imposed on the ordinary log‐likelihood: the roughness penalty on the nonparametric baseline function and a nonconcave shrinkage penalty on linear coefficients to achieve model sparsity. Compared to existing estimation equation based approaches, our procedure provides valid inference for data with missing at random, and will be more efficient if the specified model is correct. Another advantage of the new procedure is its easy computation for both regression components and variance parameters. We show that the double‐penalized problem can be conveniently reformulated into a linear mixed model framework, so that existing software can be directly used to implement our method. For the purpose of model inference, we derive both frequentist and Bayesian variance estimation for estimated parametric and nonparametric components. Simulation is used to evaluate and compare the performance of our method to the existing ones. We then apply the new method to a real data set from a lactation study.},
}
@article{qiao_zhang_liu_todd_marron_2010,
  author  = {Qiao, Xingye and Zhang, Hao Helen and Liu, Yufeng and Todd, Michael J. and Marron, J. S.},
  title   = {Weighted Distance Weighted Discrimination and Its Asymptotic Properties},
  journal = {Journal of the American Statistical Association},
  year    = {2010},
  month   = mar,
  volume  = {105},
  number  = {489},
  pages   = {401--414},
  issn    = {0162-1459},
  doi     = {10.1198/jasa.2010.tm08487},
  abstractNote = {While Distance Weighted Discrimination (DWD) is an appealing approach to classification in high dimensions, it was designed for balanced datasets. In the case of unequal costs, biased sampling, or unbalanced data, there are major improvements available, using appropriately weighted versions of DWD (wDWD). A major contribution of this paper is the development of optimal weighting schemes for various nonstandard classification problems. In addition, we discuss several alternative criteria and propose an adaptive weighting scheme (awDWD) and demonstrate its advantages over nonadaptive weighting schemes under some situations. The second major contribution is a theoretical study of weighted DWD. Both high-dimensional low sample-size asymptotics and Fisher consistency of DWD are studied. The performance of weighted DWD is evaluated using simulated examples and two real data examples. The theoretical results are also confirmed by simulations.},
}
@article{liu_tang_zhang_2009,
  author  = {Liu, Huan and Tang, Yongqiang and Zhang, Hao Helen},
  title   = {A new chi-square approximation to the distribution of non-negative definite quadratic forms in non-central normal variables},
  journal = {Computational Statistics \& Data Analysis},
  year    = {2009},
  month   = feb,
  volume  = {53},
  number  = {4},
  pages   = {853--856},
  issn    = {1872-7352},
  doi     = {10.1016/j.csda.2008.11.025},
  abstractNote = {This note proposes a new chi-square approximation to the distribution of non-negative definite quadratic forms in non-central normal variables. The unknown parameters are determined by the first four cumulants of the quadratic forms. The proposed method is compared with Pearson’s three-moment central χ2 approximation approach, by means of numerical examples. Our method yields a better approximation to the distribution of the non-central quadratic forms than Pearson’s method, particularly in the upper tail of the quadratic form, the tail most often needed in practical work.},
}
@article{ni_zhang_zhang_2009,
  author  = {Ni, Xiao and Zhang, Hao Helen and Zhang, Daowen},
  title   = {Automatic model selection for partially linear models},
  journal = {Journal of Multivariate Analysis},
  year    = {2009},
  month   = oct,
  volume  = {100},
  number  = {9},
  pages   = {2100--2111},
  issn    = {0047-259X},
  doi     = {10.1016/j.jmva.2009.06.009},
  abstractNote = {We propose and study a unified procedure for variable selection in partially linear models. A new type of double-penalized least squares is formulated, using the smoothing spline to estimate the nonparametric part and applying a shrinkage penalty on parametric components to achieve model parsimony. Theoretically we show that, with proper choices of the smoothing and regularization parameters, the proposed procedure can be as efficient as the oracle estimator (Fan and Li, 2001). We also study the asymptotic properties of the estimator when the number of parametric effects diverges with the sample size. Frequentist and Bayesian estimates of the covariance and confidence intervals are derived for the estimators. One great advantage of this procedure is its linear mixed model (LMM) representation, which greatly facilitates its implementation by using standard statistical software. Furthermore, the LMM framework enables one to treat the smoothing parameter as a variance component and hence conveniently estimate it together with other regression coefficients. Extensive numerical studies are conducted to demonstrate the effective performance of the proposed procedure.},
}
@article{hwang_zhang_ghosal_2009,
  author  = {Hwang, W. Y. and Zhang, H. H. and Ghosal, S.},
  title   = {{FIRST}: Combining forward iterative selection and shrinkage in high dimensional sparse linear regression},
  journal = {Statistics and its Interface},
  year    = {2009},
  volume  = {2},
  number  = {3},
  pages   = {341--348},
  doi     = {10.4310/sii.2009.v2.n3.a7},
  abstractNote = {We propose a new class of variable selection techniques for regression in high dimensional linear models based on a forward selection version of the LASSO, adaptive LASSO or elastic net, respectively to be called as forward iterative regression and shrinkage technique (FIRST), adaptive FIRST and elastic FIRST. These methods seem to work effectively for extremely sparse high dimensional linear models. We exploit the fact that the LASSO, adaptive LASSO and elastic net have closed form solutions when the predictor is onedimensional. The explicit formula is then repeatedly used in an iterative fashion to build the model until convergence occurs. By carefully considering the relationship between estimators at successive stages, we develop fast algorithms to compute our estimators. The performance of our new estimators are compared with commonly used estimators in terms of predictive accuracy and errors in variable selection. AMS 2000 subject classifications: Primary 62J05, 62J05; secondary 62J07.},
}
@article{zou_zhang_2009,
  author  = {Zou, Hui and Zhang, Hao Helen},
  title   = {On the Adaptive Elastic-Net With a Diverging Number of Parameters},
  journal = {Annals of Statistics},
  year    = {2009},
  month   = aug,
  volume  = {37},
  number  = {4},
  pages   = {1733--1751},
  issn    = {0090-5364},
  doi     = {10.1214/08-AOS625},
  abstractNote = {We consider the problem of model selection and estimation in situations where the number of parameters diverges with the sample size. When the dimension is high, an ideal method should have the oracle property (Fan and Li, 2001; Fan and Peng, 2004) which ensures the optimal large sample performance. Furthermore, the high-dimensionality often induces the collinearity problem which should be properly handled by the ideal method. Many existing variable selection methods fail to achieve both goals simultaneously. In this paper, we propose the adaptive Elastic-Net that combines the strengths of the quadratic regularization and the adaptively weighted lasso shrinkage. Under weak regularity conditions, we establish the oracle property of the adaptive Elastic-Net. We show by simulations that the adaptive Elastic-Net deals with the collinearity problem better than the other oracle-like methods, thus enjoying much improved finite sample performance.},
}
@article{bickel_buhlmann_yao_samworth_hall_titterington_xue_anagnostopoulos_tasoullis_zhang_et_al._2008,
  author  = {Bickel, P. and Buhlmann, P. and Yao, Q. W. and Samworth, R. and Hall, P. and Titterington, D. M. and Xue, J. H. and Anagnostopoulos, C. and Tasoullis, D. K. and Zhang, W. Y. and others},
  title   = {Sure independence screening for ultrahigh dimensional feature space discussion},
  journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  year    = {2008},
  volume  = {70},
  pages   = {883--911},
  internal-note = {NOTE(review): original citation key contained a space ("et al._2008"), which is not citable in LaTeX; renamed with an underscore -- update any existing \cite commands accordingly},
}
@article{zhang_lu_2007,
  author  = {Zhang, Hao Helen and Lu, Wenbin},
  title   = {Adaptive lasso for {Cox's} proportional hazards model},
  journal = {Biometrika},
  year    = {2007},
  month   = aug,
  volume  = {94},
  number  = {3},
  pages   = {691--703},
  issn    = {0006-3444},
  doi     = {10.1093/biomet/asm037},
  abstractNote = {We investigate the variable selection problem for Cox's proportional hazards model, and propose a unified model selection and estimation procedure with desired theoretical properties and computational convenience. The new method is based on a penalized log partial likelihood with the adaptively weighted L 1 penalty on regression coefficients, providing what we call the adaptive Lasso estimator. The method incorporates different penalties for different coefficients: unimportant variables receive larger penalties than important ones, so that important variables tend to be retained in the selection process, whereas unimportant variables are more likely to be dropped. Theoretical properties, such as consistency and rate of convergence of the estimator, are studied. We also show that, with proper choice of regularization parameters, the proposed estimator has the oracle properties. The convex optimization nature of the method leads to an efficient algorithm. Both simulated and real examples show that the method performs competitively. Copyright 2007, Oxford University Press.},
}
@article{liu_zhang_park_ahn_2007,
  author  = {Liu, Yufeng and Zhang, Hao Helen and Park, Cheolwoo and Ahn, Jeongyoun},
  title   = {Support vector machines with adaptive {L-q} penalty},
  journal = {Computational Statistics \& Data Analysis},
  year    = {2007},
  month   = aug,
  volume  = {51},
  number  = {12},
  pages   = {6380--6394},
  issn    = {1872-7352},
  doi     = {10.1016/j.csda.2007.02.006},
  abstractNote = {The standard support vector machine (SVM) minimizes the hinge loss function subject to the L2 penalty or the roughness penalty. Recently, the L1 SVM was suggested for variable selection by producing sparse solutions [Bradley, P., Mangasarian, O., 1998. Feature selection via concave minimization and support vector machines. In: Shavlik, J. (Ed.), ICML’98. Morgan Kaufmann, Los Altos, CA; Zhu, J., Hastie, T., Rosset, S., Tibshirani, R., 2003. 1-norm support vector machines. Neural Inform. Process. Systems 16]. These learning methods are non-adaptive since their penalty forms are pre-determined before looking at data, and they often perform well only in a certain type of situation. For instance, the L2 SVM generally works well except when there are too many noise inputs, while the L1 SVM is more preferred in the presence of many noise variables. In this article we propose and explore an adaptive learning procedure called the Lq SVM, where the best q>0 is automatically chosen by data. Both two- and multi-class classification problems are considered. We show that the new adaptive approach combines the benefit of a class of non-adaptive procedures and gives the best performance of this class across a variety of situations. Moreover, we observe that the proposed Lq penalty is more robust to noise variables than the L1 and L2 penalties. An iterative algorithm is suggested to solve the Lq SVM efficiently. Simulations and real data applications support the effectiveness of the proposed procedure.},
}
@article{lu_zhang_2007,
  author       = {Lu, Wenbin and Zhang, Hao H.},
  title        = {Variable selection for proportional odds model},
  journal      = {Statistics in Medicine},
  volume       = {26},
  number       = {20},
  pages        = {3771--3781},
  year         = {2007},
  month        = sep,
  issn         = {1097-0258},
  doi          = {10.1002/sim.2833},
  abstractNote = {In this paper we study the problem of variable selection for the proportional odds model, which is a useful alternative to the proportional hazards model and might be appropriate when the proportional hazards assumption is not satisfied. We propose to fit the proportional odds model by maximizing the marginal likelihood subject to a shrinkage-type penalty, which encourages sparse solutions and hence facilitates the process of variable selection. Two types of shrinkage penalties are considered: the LASSO and the adaptive-LASSO (ALASSO) penalty. In the ALASSO penalty, different weights are imposed on different coefficients such that important variables are more protectively retained in the final model while unimportant ones are more likely to be shrunk to zeros. We further provide an efficient computation algorithm to implement the proposed methods, and demonstrate their performance through simulation studies and an application to real data. Numerical results indicate that both methods can produce accurate and interpretable models, and the ALASSO tends to work better than the usual LASSO.},
}
@article{zhang_lin_2006,
  author  = {Zhang, H. H. and Lin, Y.},
  title   = {Component selection and smoothing for nonparametric regression in exponential families},
  journal = {Statistica Sinica},
  volume  = {16},
  number  = {3},
  pages   = {1021--1041},
  year    = {2006},
}
@article{lin_zhang_2006,
  author       = {Lin, Yi and Zhang, Hao Helen},
  title        = {Component selection and smoothing in multivariate nonparametric regression},
  journal      = {Annals of Statistics},
  volume       = {34},
  number       = {5},
  pages        = {2272--2297},
  year         = {2006},
  month        = oct,
  issn         = {0090-5364},
  doi          = {10.1214/009053606000000722},
  abstractNote = {We propose a new method for model selection and model fitting in multivariate nonparametric regression models, in the framework of smoothing spline ANOVA. The "COSSO" is a method of regularization with the penalty functional being the sum of component norms, instead of the squared norm employed in the traditional smoothing spline method. The COSSO provides a unified framework for several recent proposals for model selection in linear models and smoothing spline ANOVA models. Theoretical properties, such as the existence and the rate of convergence of the COSSO estimator, are studied. In the special case of a tensor product design with periodic functions, a detailed analysis reveals that the COSSO does model selection by applying a novel soft thresholding type operation to the function components. We give an equivalent formulation of the COSSO estimator which leads naturally to an iterative algorithm. We compare the COSSO with MARS, a popular method that builds functional ANOVA models, in simulations and real examples. The COSSO method can be extended to classification problems and we compare its performance with those of a number of machine learning algorithms on real datasets. The COSSO gives very competitive performance in these studies.},
}
@article{zhang_ahn_lin_park_2006,
  author       = {Zhang, H. H. and Ahn, J. and Lin, X. D. and Park, C.},
  title        = {Gene selection using support vector machines with non-convex penalty},
  journal      = {Bioinformatics},
  volume       = {22},
  number       = {1},
  pages        = {88--95},
  year         = {2006},
  month        = jan,
  issn         = {1460-2059},
  doi          = {10.1093/bioinformatics/bti736},
  abstractNote = {MOTIVATION
With the development of DNA microarray technology, scientists can now measure the expression levels of thousands of genes simultaneously in one single experiment. One current difficulty in interpreting microarray data comes from their innate nature of 'high-dimensional low sample size'. Therefore, robust and accurate gene selection methods are required to identify differentially expressed group of genes across different samples, e.g. between cancerous and normal cells. Successful gene selection will help to classify different cancer types, lead to a better understanding of genetic signatures in cancers and improve treatment strategies. Although gene selection and cancer classification are two closely related problems, most existing approaches handle them separately by selecting genes prior to classification. We provide a unified procedure for simultaneous gene selection and cancer classification, achieving high accuracy in both aspects.
RESULTS
In this paper we develop a novel type of regularization in support vector machines (SVMs) to identify important genes for cancer classification. A special nonconvex penalty, called the smoothly clipped absolute deviation penalty, is imposed on the hinge loss function in the SVM. By systematically thresholding small estimates to zeros, the new procedure eliminates redundant genes automatically and yields a compact and accurate classifier. A successive quadratic algorithm is proposed to convert the non-differentiable and non-convex optimization problem into easily solved linear equation systems. The method is applied to two real datasets and has produced very promising results.
AVAILABILITY
MATLAB codes are available upon request from the authors.},
}
@article{leng_zhang_2006,
  author       = {Leng, Chenlei and Zhang, Hao Helen},
  title        = {Model selection in nonparametric hazard regression},
  journal      = {Journal of Nonparametric Statistics},
  volume       = {18},
  number       = {7-8},
  pages        = {417--429},
  year         = {2006},
  issn         = {1029-0311},
  doi          = {10.1080/10485250601027042},
  abstractNote = {We propose a novel model selection method for a nonparametric extension of the Cox proportional hazard model, in the framework of smoothing splines ANOVA models. The method automates the model building and model selection processes simultaneously by penalizing the reproducing kernel Hilbert space norms. On the basis of a reformulation of the penalized partial likelihood, we propose an efficient algorithm to compute the estimate. The solution demonstrates great flexibility and easy interpretability in modeling relative risk functions for censored data. Adaptive choice of the smoothing parameter is discussed. Both simulations and a real example suggest that our proposal is a useful tool for multivariate function estimation and model selection in survival analysis.},
}
@article{tang_zhang_2006,
  author       = {Tang, Yongqiang and Zhang, Hao Helen},
  title        = {Multiclass proximal support vector machines},
  journal      = {Journal of Computational and Graphical Statistics},
  volume       = {15},
  number       = {2},
  pages        = {339--355},
  year         = {2006},
  month        = jun,
  issn         = {1537-2715},
  doi          = {10.1198/106186006X113647},
  abstractNote = {This article proposes the multiclass proximal support vector machine (MPSVM) classifier, which extends the binary PSVM to the multiclass case. Unlike the one-versus-rest approach that constructs the decision rule based on multiple binary classification tasks, the proposed method considers all classes simultaneously and has better theoretical properties and empirical performance. We formulate the MPSVM as a regularization problem in the reproducing kernel Hilbert space and show that it implements the Bayes rule for classification. In addition, the MPSVM can handle equal and unequal misclassification costs in a unified framework. We suggest an efficient algorithm to implement the MPSVM by solving a system of linear equations. This algorithm requires much less computational effort than solving the standard SVM, which often requires quadratic programming and can be slow for large problems. We also provide an alternative and more robust algorithm for ill-posed problems. The effectiveness of the MPSVM is demonstrated by both simulation studies and applications to cancer classifications using microarray data.},
}
@article{zhang_2006,
  author  = {Zhang, H. H.},
  title   = {Variable selection for support vector machines via smoothing spline {ANOVA}},
  journal = {Statistica Sinica},
  volume  = {16},
  number  = {2},
  pages   = {659--674},
  year    = {2006},
}
@article{ferris_voelker_zhang_2004,
  author       = {Ferris, M. C. and Voelker, M. M. and Zhang, H. H.},
  title        = {Model building with likelihood basis pursuit},
  journal      = {Optimization Methods \& Software},
  volume       = {19},
  number       = {5},
  pages        = {577--594},
  year         = {2004},
  month        = oct,
  issn         = {1029-4937},
  doi          = {10.1080/1055678042000221719},
  abstractNote = {We consider a non-parametric penalized likelihood approach for model building called likelihood basis pursuit (LBP) that determines the probabilities of binary outcomes given explanatory vectors while automatically selecting important features. The LBP model involves parameters that balance the competing goals of maximizing the log-likelihood and minimizing the penalized basis pursuit terms. These parameters are selected to minimize a proxy of misclassification error, namely, the randomized, generalized approximate cross validation (ranGACV) function. The ranGACV function is not easily represented in compact form; its functional values can only be obtained by solving two instances of the LBP model, which may be computationally expensive. E-mail: voelker@cs.wisc.edu A grid search is typically used to find appropriate parameters, requiring the solutions to hundreds or thousands of instances of the LBP model. Since only parameters (data) are changed between solves, the resulting problem is a non-linear slice model in the parameter space. We show how slice-modeling techniques significantly improve the efficiency of individual solves and thus speed-up the grid search. In addition, we consider using derivative-free optimization algorithms for parameter selection, replacing the grid search. We show how, by seeding the derivative-free algorithms with a coarse grid search, these algorithms can find better solutions with fewer function evaluations. Our interest in this area comes directly from the seminal work that Olvi and his collaborators have carried out designing and applying optimization techniques to problems in machine learning and data mining. 
E-mail: hzhang2@stat.ncsu.edu},
}
@article{zhang_wahba_lin_voelker_ferris_klein_klein_2004,
  author       = {Zhang, H. H. and Wahba, G. and Lin, Y. and Voelker, M. and Ferris, M. and Klein, R. and Klein, B.},
  title        = {Variable selection and model building via likelihood basis pursuit},
  journal      = {Journal of the American Statistical Association},
  volume       = {99},
  number       = {467},
  pages        = {659--672},
  year         = {2004},
  month        = sep,
  issn         = {1537-274X},
  doi          = {10.1198/016214504000000593},
  abstractNote = {This article presents a nonparametric penalized likelihood approach for variable selection and model building, called likelihood basis pursuit (LBP). In the setting of a tensor product reproducing kernel Hilbert space, we decompose the log-likelihood into the sum of different functional components such as main effects and interactions, with each component represented by appropriate basis functions. Basis functions are chosen to be compatible with variable selection and model building in the context of a smoothing spline ANOVA model. Basis pursuit is applied to obtain the optimal decomposition in terms of having the smallest l1 norm on the coefficients. We use the functional L1 norm to measure the importance of each component and determine the “threshold” value by a sequential Monte Carlo bootstrap test algorithm. As a generalized LASSO-type method, LBP produces shrinkage estimates for the coefficients, which greatly facilitates the variable selection process and provides highly interpretable multivariate functional estimates at the same time. To choose the regularization parameters appearing in the LBP models, generalized approximate cross-validation (GACV) is derived as a tuning criterion. To make GACV widely applicable to large datasets, its randomized version is proposed as well. A technique “slice modeling” is used to solve the optimization problem and makes the computation more efficient. 
LBP has great potential for a wide range of research and application areas such as medical studies, and in this article we apply it to two large ongoing epidemiologic studies, the Wisconsin Epidemiologic Study of Diabetic Retinopathy (WESDR) and the Beaver Dam Eye Study (BDES).},
}