% Journal articles, newest first. Conventions used in this file:
%   - one field per line, in the order: author, title, journal, volume, number,
%     pages, year, month, publisher, issn, doi, url, abstractNote
%   - month uses the predefined three-letter macros, unquoted
%   - page ranges use an ASCII double hyphen
%   - journal names are stored in title case; styles decide presentation
%   - case-sensitive words in titles are brace-protected
%   - abstractNote is a non-standard field holding the abstract verbatim;
%     standard styles ignore unknown fields

@article{dixon_keshavamurthy_farber_stevens_pazdernik_charles_2022,
  author       = {Dixon, Samuel and Keshavamurthy, Ravikiran and Farber, Daniel H. and Stevens, Andrew and Pazdernik, Karl T. and Charles, Lauren E.},
  title        = {A Comparison of Infectious Disease Forecasting Methods across Locations, Diseases, and Time},
  journal      = {Pathogens},
  volume       = {11},
  number       = {2},
  year         = {2022},
  month        = feb,
  issn         = {2076-0817},
  doi          = {10.3390/pathogens11020185},
  abstractNote = {Accurate infectious disease forecasting can inform efforts to prevent outbreaks and mitigate adverse impacts. This study compares the performance of statistical, machine learning (ML), and deep learning (DL) approaches in forecasting infectious disease incidences across different countries and time intervals. We forecasted three diverse diseases: campylobacteriosis, typhoid, and Q-fever, using a wide variety of features (n = 46) from public datasets, e.g., landscape, climate, and socioeconomic factors. We compared autoregressive statistical models to two tree-based ML models (extreme gradient boosted trees [XGB] and random forest [RF]) and two DL models (multi-layer perceptron and encoder–decoder model). The disease models were trained on data from seven different countries at the region-level between 2009–2017. Forecasting performance of all models was assessed using mean absolute error, root mean square error, and Poisson deviance across Australia, Israel, and the United States for the months of January through August of 2018. The overall model results were compared across diseases as well as various data splits, including country, regions with highest and lowest cases, and the forecasted months out (i.e., nowcasting, short-term, and long-term forecasting). Overall, the XGB models performed the best for all diseases and, in general, tree-based ML models performed the best when looking at data splits. There were a few instances where the statistical or DL models had minutely smaller error metrics for specific subsets of typhoid, which is a disease with very low case counts. Feature importance per disease was measured by using four tree-based ML models (i.e., XGB and RF with and without region name as a feature). The most important feature groups included previous case counts, region name, population counts and density, mortality causes of neonatal to under 5 years of age, sanitation factors, and elevation. This study demonstrates the power of ML approaches to incorporate a wide range of factors to forecast various diseases, regardless of location, more accurately than traditional statistical approaches.},
}

@article{bakerman_pazdernik_korkmaz_wilson_2022,
  author       = {Bakerman, Jordan and Pazdernik, Karl and Korkmaz, Gizem and Wilson, Alyson G.},
  title        = {Dynamic logistic regression and variable selection: Forecasting and contextualizing civil unrest},
  journal      = {International Journal of Forecasting},
  volume       = {38},
  number       = {2},
  pages        = {648--661},
  year         = {2022},
  publisher    = {Elsevier BV},
  issn         = {1872-8200},
  doi          = {10.1016/j.ijforecast.2021.07.003},
  url          = {https://doi.org/10.1016/j.ijforecast.2021.07.003},
  abstractNote = {Civil unrest can range from peaceful protest to violent furor, and researchers are working to monitor, forecast, and assess such events to allocate resources better. Twitter has become a real-time data source for forecasting civil unrest because millions of people use the platform as a social outlet. Daily word counts are used as model features, and predictive terms contextualize the reasons for the protest. To forecast civil unrest and infer the reasons for the protest, we consider the problem of Bayesian variable selection for the dynamic logistic regression model and propose using penalized credible regions to select parameters of the updated state vector. This method avoids the need for shrinkage priors, is scalable to high-dimensional dynamic data, and allows the importance of variables to vary in time as new information becomes available. A substantial improvement in both precision and F1-score using this approach is demonstrated through simulation. Finally, we apply the proposed model fitting and variable selection methodology to the problem of forecasting civil unrest in Latin America. Our dynamic logistic regression approach shows improved accuracy compared to the static approach currently used in event prediction and feature selection.},
}

@article{keshavamurthy_dixon_pazdernik_charles_2022,
  author       = {Keshavamurthy, Ravikiran and Dixon, Samuel and Pazdernik, Karl T. and Charles, Lauren E.},
  title        = {Predicting infectious disease for biopreparedness and response: A systematic review of machine learning and deep learning approaches},
  journal      = {One Health},
  volume       = {15},
  year         = {2022},
  month        = dec,
  issn         = {2352-7714},
  doi          = {10.1016/j.onehlt.2022.100439},
  abstractNote = {The complex, unpredictable nature of pathogen occurrence has required substantial efforts to accurately predict infectious diseases (IDs). With rising popularity of Machine Learning (ML) and Deep Learning (DL) techniques combined with their unique ability to uncover connections between large amounts of diverse data, we conducted a PRISMA systematic review to investigate advances in ID prediction for human and animal diseases using ML and DL. This review included the type of IDs modeled, ML and DL techniques utilized, geographical distribution, prediction tasks performed, input features utilized, spatial and temporal scales, error metrics used, computational efficiency, uncertainty quantification, and missing data handling methods. Among 237 relevant articles published between January 2001 and May 2021, highly contagious diseases in humans were most often represented, including COVID-19 (37.1%), influenza/influenza-like illnesses (9.3%), dengue (8.9%), and malaria (5.1%). Out of 37 diseases identified, 51.4% were zoonotic, 37.8% were human-only, and 8.1% were animal-only, with only 1.6% economically significant, non-zoonotic livestock diseases. Despite the number of zoonoses, 86.5% of articles modeled humans whereas only a few articles (5.1%) contained more than one host species. Eastern Asia (32.5%), North America (17.7%), and Southern Asia (13.1%) were the most represented locations. Frequent approaches included tree-based ML (38.4%) and feed-forward neural networks (26.6%). Articles predicted temporal incidence (66.7%), disease risk (38.0%), and/or spatial movement (31.2%). Less than 10% of studies addressed uncertainty quantification, computational efficiency, and missing data, which are essential to operational use and deployment. This study highlights trends and gaps in ML and DL for ID prediction, providing guidelines for future works to better support biopreparedness and response. To fully utilize ML and DL for improved ID forecasting, models should include the full disease ecology in a One-Health context, important food and agricultural diseases, underrepresented hotspots, and important metrics required for operational deployment.},
}

@article{pazdernik_maitra_2021,
  author       = {Pazdernik, Karl and Maitra, Ranjan},
  title        = {Estimating basis functions in massive fields under the spatial mixed effects model},
  journal      = {Statistical Analysis and Data Mining},
  year         = {2021},
  month        = jul,
  issn         = {1932-1872},
  doi          = {10.1002/sam.11537},
  abstractNote = {Spatial prediction is commonly achieved under the assumption of a Gaussian random field by obtaining maximum likelihood estimates of parameters, and then using the kriging equations to arrive at predicted values. For massive datasets, fixed rank kriging using the expectation–maximization algorithm for estimation has been proposed as an alternative to the usual but computationally prohibitive kriging method. The method reduces computation cost of estimation by redefining the spatial process as a linear combination of basis functions and spatial random effects. A disadvantage of this method is that it imposes constraints on the relationship between the observed locations and the knots. We develop an alternative method that utilizes the spatial mixed effects model, but allows for additional flexibility by estimating the range of the spatial dependence between the observations and the knots via an alternating expectation conditional maximization algorithm. Experiments show that our methodology improves estimation without sacrificing prediction accuracy while also minimizing the additional computational burden of extra parameter estimation. The methodology is applied to a temperature dataset archived by the United States National Climate Data Center, with improved results over previous methodology.},
}

@article{pazdernik_lahaye_artman_zhu_2020,
  author       = {Pazdernik, Karl and LaHaye, Nicole L. and Artman, Conor M. and Zhu, Yuanyuan},
  title        = {Microstructural classification of unirradiated {LiAlO2} pellets by deep learning methods},
  journal      = {Computational Materials Science},
  volume       = {181},
  year         = {2020},
  month        = aug,
  issn         = {1879-0801},
  doi          = {10.1016/j.commatsci.2020.109728},
  abstractNote = {Microstructural features and defects can greatly impact material properties and performance in a wide range of application areas. Recognition and characterization of microstructural features is essential to the understanding and prediction of material performance under various operational conditions, including irradiation. In this work, we tested a collection of Deep Convolutional Neural Network (DCNN) architectures that have been optimized for image segmentation and selected the best performer to obtain pixel-level classification of the main microstructural features in unirradiated LiAlO2 pellets, including grains, grain boundaries, voids, precipitates, and zirconia impurities. LiAlO2 is an important material that is used as a tritium producer for the Tritium Sustainment Program. While LiAlO2 pellets have been employed in tritium-producing burnable absorber rods (TPBARs) for years, comprehensive microstructural analysis of unirradiated LiAlO2, and therefore time-dependent tritium release from the material during irradiation, has not been established. A full understanding of unirradiated LiAlO2 microstructure and how it evolves as a result of neutron irradiation is necessary to produce an integrated performance model to predict in-reactor behavior as well as to target strategic experiments. This work aims at developing a fast and quantitative analysis method to classify various microstructural features in unirradiated LiAlO2 pellets that are visualized by scanning electron microscopy (SEM). Given classification results obtained, statistical analysis was then carried out to evaluate the performance of the DCNN classification and to describe the properties of the microstructural features as a whole, based on standard aggregation and spatial point-process methodology. Our results show improved performance over a baseline heuristic approach. Also, the computational efficiency of the computer-aided analytical method allows for quantitative characterization of a larger volume of SEM images than was previously possible using manual segmentation.},
}

@article{bakerman_pazdernik_wilson_fairchild_bahran_2018,
  author       = {Bakerman, J. and Pazdernik, K. and Wilson, A. and Fairchild, G. and Bahran, R.},
  title        = {{Twitter} geolocation: A hybrid approach},
  journal      = {ACM Transactions on Knowledge Discovery from Data},
  volume       = {12},
  number       = {3},
  year         = {2018},
}