% Journal article introducing the ImputEHR imputation tool (Frontiers in Genetics, vol. 12, 2021).
% abstractNote is a non-standard field (ignored by standard styles) that keeps the abstract text.
@article{zhou_saghapour_2021,
  author       = {Zhou, Yi-Hui and Saghapour, Ehsan},
  title        = {{ImputEHR}: A Visualization Tool of Imputation for the Prediction of Biomedical Data},
  journal      = {Frontiers in Genetics},
  volume       = {12},
  year         = {2021},
  month        = jul,
  issn         = {1664-8021},
  doi          = {10.3389/fgene.2021.691274},
  abstractNote = {Electronic health records (EHRs) have been widely adopted in recent years, but often include a high proportion of missing data, which can create difficulties in implementing machine learning and other tools of personalized medicine. Completed datasets are preferred for a number of analysis methods, and successful imputation of missing EHR data can improve interpretation and increase our power to predict health outcomes. However, use of the most popular imputation methods mainly require scripting skills, and are implemented using various packages and syntax. Thus, the implementation of a full suite of methods is generally out of reach to all except experienced data scientists. Moreover, imputation is often considered as a separate exercise from exploratory data analysis, but should be considered as part of the data exploration process. We have created a new graphical tool, ImputEHR, that is based on a Python base and allows implementation of a range of simple and sophisticated (e.g., gradient-boosted tree-based and neural network) data imputation approaches. In addition to imputation, the tool enables data exploration for informed decision-making, as well as implementing machine learning prediction tools for response data selected by the user. Although the approach works for any missing data problem, the tool is primarily motivated by problems encountered for EHR and other biomedical data. We illustrate the tool using multiple real datasets, providing performance measures of imputation and downstream predictive analysis.},
}

% Journal article on combined image/'omics prediction of histological phenotypes (Frontiers in Genetics, vol. 11, 2020).
% abstractNote is a non-standard field (ignored by standard styles) that keeps the abstract text.
@article{gallins_saghapour_zhou_2020,
  author       = {Gallins, Paul and Saghapour, Ehsan and Zhou, Yi-Hui},
  title        = {Exploring the Limits of Combined Image/'omics Analysis for Non-cancer Histological Phenotypes},
  journal      = {Frontiers in Genetics},
  volume       = {11},
  year         = {2020},
  month        = oct,
  issn         = {1664-8021},
  doi          = {10.3389/fgene.2020.555886},
  abstractNote = {The last several years have witnessed an explosion of methods and applications for combining image data with 'omics data, and for prediction of clinical phenotypes. Much of this research has focused on cancer histology, for which genetic perturbations are large, and the signal to noise ratio is high. Related research on chronic, complex diseases is limited by tissue sample availability, lower genomic signal strength, and the less extreme and tissue-specific nature of intermediate histological phenotypes. Data from the GTEx Consortium provides a unique opportunity to investigate the connections among phenotypic histological variation, imaging data, and 'omics profiling, from multiple tissue-specific phenotypes at the sub-clinical level. Investigating histological designations in multiple tissues, we survey the evidence for genomic association and prediction of histology, and use the results to test the limits of prediction accuracy using machine learning methods applied to the imaging data, genomics data, and their combination. We find that expression data has similar or superior accuracy for pathology prediction as our use of imaging data, despite the fact that pathological determination is made from the images themselves. A variety of machine learning methods have similar performance, while network embedding methods offer at best limited improvements. These observations hold across a range of tissues and predictor types. The results are supportive of the use of genomic measurements for prediction, and in using the same target tissue in which pathological phenotyping has been performed. Although this last finding is sensible, to our knowledge our study is the first to demonstrate this fact empirically. Even while prediction accuracy remains a challenge, the results show clear evidence of pathway and tissue-specific biology.},
}