% Bibliography database: four journal articles (2009-2011).
% Conventions used below: one field per line, lowercase field names,
% month given as the predefined three-letter macro, page ranges written
% with a double hyphen, journal names stored in full in normal case.
% The non-standard abstractNote field is kept as exported; standard
% bibliography styles ignore unknown fields.

% Sun and Wei (2011): set-wise multiple testing for time-course data.
@article{sun_wei_2011,
  author       = {Sun, Wenguang and Wei, Zhi},
  title        = {Multiple Testing for Pattern Identification, With Applications to Microarray Time-Course Experiments},
  journal      = {Journal of the American Statistical Association},
  year         = {2011},
  month        = mar,
  volume       = {106},
  number       = {493},
  pages        = {73--88},
  issn         = {0162-1459},
  doi          = {10.1198/jasa.2011.ap09587},
  abstractNote = {In time-course experiments, it is often desirable to identify genes that exhibit a specific pattern of differential expression over time and thus gain insights into the mechanisms of the underlying biological processes. Two challenging issues in the pattern identification problem are: (i) how to combine the simultaneous inferences across multiple time points and (ii) how to control the multiplicity while accounting for the strong dependence. We formulate a compound decision-theoretic framework for set-wise multiple testing and propose a data-driven procedure that aims to minimize the missed set rate subject to a constraint on the false set rate. The hidden Markov model proposed in Yuan and Kendziorski (2006) is generalized to capture the temporal correlation in the gene expression data. Both theoretical and numerical results are presented to show that our data-driven procedure controls the multiplicity, provides an optimal way of combining simultaneous inferences across multiple time points, and greatly improves the conventional combined p-value methods. In particular, we demonstrate our method in an application to a study of systemic inflammation in humans for detecting early and late response genes.},
}

% Sun, Joffe, Chen and Brunelli (2010): multiple events case-control design.
@article{sun_joffe_chen_brunelli_2010,
  author       = {Sun, Wenguang and Joffe, Marshall M. and Chen, Jinbo and Brunelli, Steven M.},
  title        = {Design and Analysis of Multiple Events Case-Control Studies},
  journal      = {Biometrics},
  year         = {2010},
  month        = dec,
  volume       = {66},
  number       = {4},
  pages        = {1220--1229},
  issn         = {1541-0420},
  doi          = {10.1111/j.1541-0420.2009.01369.x},
  abstractNote = {In case--control research where there are multiple case groups, standard analyses fail to make use of all available information. Multiple events case--control (MECC) studies provide a new approach to sampling from a cohort and are useful when it is desired to study multiple types of events in the cohort. In this design, subjects in the cohort who develop any event of interest are sampled, as well as a fraction of the remaining subjects. We show that a simple case--control analysis of data arising from MECC studies is biased and develop three general estimating-equation-based approaches to analyzing data from these studies. We conduct simulation studies to compare the efficiency of the various MECC analyses with each other and with the corresponding conventional analyses. It is shown that the gain in efficiency by using the new design is substantial in many situations. We demonstrate the application of our approach to a nested case--control study of the effect of oral sodium phosphate use on chronic kidney injury with multiple case definitions.},
}

% Wang, Wei and Sun (2010): set-wise testing for GWAS.
% Typed as an article: the record carries a volume, an issue number and a
% journal-style DOI, and the venue name belongs in the journal field.
@article{wang_wei_sun_2010,
  author       = {Wang, W. and Wei, Z. and Sun, W. G.},
  title        = {Simultaneous set-wise testing under dependence, with applications to genome-wide association studies},
  journal      = {Statistics and Its Interface},
  year         = {2010},
  volume       = {3},
  number       = {4},
  pages        = {501--511},
  doi          = {10.4310/sii.2010.v3.n4.a8},
  abstractNote = {We consider the problem of identifying disease-associated genomic regions in genome-wide association studies (GWAS). It is shown that conventional single SNP analysis can be greatly improved by (i) exploiting the spatial dependency and (ii) conducting set-wise analysis. The SNP set association problem can be conceptualized as the problem of simultaneously testing a large number of sets of hypotheses. We use hidden Markov models to exploit the linkage disequilibrium information in GWAS data, based on which a data-driven screening procedure (GLIS) is proposed. GLIS is shown to be optimal in the sense that it has the smallest missed set rate (MSR) among all valid false set rate (FSR) procedures. The numerical results demonstrate that the proposed procedure controls the FSR at the desired level, enjoys certain optimality properties and outperforms conventional combined p-value methods. We apply the GLIS procedure to analyze a Type 1 diabetes (T1D) GWAS dataset for detecting T1D associated genomic regions. The results show that our proposed SNP set analysis not only provides better biological insights, but also increases the statistical power by pooling information from different samples.},
}

% Wei, Sun, Wang and Hakonarson (2009): hidden Markov model testing for GWAS.
% The exported abstract contained only the heading word, so no abstract
% field is stored for this entry.
@article{wei_sun_wang_hakonarson_2009,
  author  = {Wei, Zhi and Sun, Wenguang and Wang, Kai and Hakonarson, Hakon},
  title   = {Multiple testing in genome-wide association studies via hidden {Markov} models},
  journal = {Bioinformatics},
  year    = {2009},
  month   = nov,
  volume  = {25},
  number  = {21},
  pages   = {2802--2808},
  issn    = {1460-2059},
  doi     = {10.1093/bioinformatics/btp476},
}