% Journal article (2023): effect of data preprocessing on Bayesian network
% models of PFOA in Minnesota groundwater.
% The abstractNote field is an export annotation that standard styles ignore;
% its text is kept verbatim.
@article{li_gibson_2023,
  author       = {Li, Runwei and Gibson, Jacqueline MacDonald},
  title        = {Predicting Groundwater {PFOA} Exposure Risks with {Bayesian} Networks: Empirical Impact of Data Preprocessing on Model Performance},
  journal      = {Environmental Science \& Technology},
  year         = {2023},
  month        = aug,
  issn         = {1520-5851},
  doi          = {10.1021/acs.est.3c00348},
  abstractNote = {The plethora of data on PFASs in environmental samples collected in response to growing concern about these chemicals could enable the training of machine-learning models for predicting exposure risks. However, differences in sampling and analysis methods across data sets must be reconciled through data preprocessing, and little information is available about how such manipulations affect the resulting models. This study evaluates how data preprocessing influences machine-learned Bayesian network models of PFOA in groundwater. We link 19 years of PFOA measurements from Minnesota, USA, to publicly available information about potential PFOA sources and factors that may influence their environmental fate. Nine different preprocessing methods were tested, and the resulting data sets were used to train models to predict the probability of PFOA ≥ 35 ppt, the 2017 Minnesota health advisory level. Different preprocessing approaches produced varying model structures with significantly different accuracies. Nonetheless, models showed similar relationships between predictor variables and PFOA exposure risks, and all models were relatively accurate, distinguishing wells at high risk from those at low risk for 82.0% to 89.0% of test data samples. There was a trade-off between data quality and model performance since a stricter data screening strategy decreased the sample size for model training.},
}

% Journal article (2022): machine-learned Bayesian networks for predicting
% short-chain PFAS occurrence in groundwater.
@article{li_gibson_2022,
  author       = {Li, Runwei and Gibson, Jacqueline MacDonald},
  title        = {Predicting the occurrence of short-chain {PFAS} in groundwater using machine-learned {Bayesian} networks},
  journal      = {Frontiers in Environmental Science},
  year         = {2022},
  month        = nov,
  volume       = {10},
  issn         = {2296-665X},
  doi          = {10.3389/fenvs.2022.958784},
  abstractNote = {In the past two decades, global manufacturing of per- and polyfluoroalkyl substances (PFAS) has shifted from long-chain compounds to short-chain alternatives in response to evidence of the health hazards of long-chain formulations. However, accumulating data indicate that short-chain PFAS also pose health risks and are highly mobile and persistent in the environment. Because short-chain PFAS are relatively new chemicals, comprehensive knowledge needed to predict their environmental fate is lacking. This study evaluated the capacity of machine-learned Bayesian networks (BNs) to predict risks of exposure to short-chain PFAS in a Minnesota region affected by PFAS releases from the 3M Cottage Grove facility. Models were trained using long-term monitoring data provided by the Minnesota Department of Health (n = 12,406), which we coupled to a comprehensive dataset created by curating 88 other variables that describe potential PFAS sources, soil and hydrogeologic characteristics, and land use. Model performance was assessed using the area under the receiver-operating characteristic curve (AUC), a common measure of the accuracy of machine-learned classification algorithms. In addition, exposure risks were visualized spatially by coupling model predictions to a geographic information system. We found that machine-learned BN models had robust predictive performance, with AUCs above 0.96 in cross-validation. Significant risk factors identified by the BNs include distance to the 3M factory, distance to a former landfill, and areal extent of wetlands and developed land. We also found that risks of exposure to and the areal extent of perfluorosulfonic acids were greater than for perfluorocarboxylic acids with the same carbon number. The results suggest that machine-learned BNs could provide a promising screening tool for assessing short-chain PFAS exposure risks in groundwater.},
}