% Plyler and Chi (2025), AAAI proceedings paper on iterative counterfactual data augmentation.
% NOTE(review): volume/number below are read off the DOI suffix "v39i19" -- confirm against
% the publisher page; page numbers are not available in this file and still need to be added.
@article{plyler_chi_2025,
  author       = {Plyler, Mitchell and Chi, Min},
  title        = {Iterative Counterfactual Data Augmentation},
  journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume       = {39},
  number       = {19},
  year         = {2025},
  month        = apr,
  doi          = {10.1609/aaai.v39i19.34195},
  abstractNote = {Counterfactual data augmentation (CDA) is a method for controlling information or biases in training datasets by generating a complementary dataset with typically opposing biases. Prior work often either relies on hand-crafted rules or algorithmic CDA methods which can leave unwanted information in the augmented dataset. In this work, we show iterative CDA (ICDA) with initial, high-noise interventions can converge to a state with significantly lower noise. Our ICDA procedure produces a dataset where one target signal in the training dataset maintains high mutual information with a corresponding label and the information of spurious signals are reduced. We show training on the augmented datasets produces rationales on documents that better align with human annotation. Our experiments include six human produced datasets and two large-language model generated datasets.},
}

% Rus et al. (2025), PLOS Digital Health article on predicting enrollment in a
% telephone-based principal care management service from call transcripts.
@article{rus_ivy_chi_plyler_wells-gray_mayorga_2025,
  author       = {Rus, Annisa Marlin Masbar and Ivy, Julie S. and Chi, Min and Plyler, Mitchell and Wells-Gray, Elaine and Mayorga, Maria E.},
  title        = {Predicting patient enrollment in a telephone-based principal care management service using topic modeling},
  journal      = {PLOS Digital Health},
  volume       = {4},
  number       = {9},
  year         = {2025},
  month        = sep,
  doi          = {10.1371/journal.pdig.0000992},
  abstractNote = {Diabetic Retinopathy (DR) is a complication related to diabetes that can lead to vision impairment. To assist DR patients, a care management company provides a telephone-based principal care management (PCM) service, which includes care coaching and other services to reduce barriers to care for patients with DR. Despite its benefits, enrollment in the program is suboptimal. This study developed predictive models using call transcripts to investigate factors associated with patient enrollment in the PCM service.
We analyzed transcripts of calls made during the enrollment process (prior to enrollment) and feature-engineered the call metadata (i.e., transcript length, number of calls, time between calls, customer and agent sentiment). In addition, we extracted topics discussed in the transcripts using Structural Topic Modeling (STM) and converted them into vector representations. Utilizing call metadata alongside topics, we developed three classification models (call metadata, topic-based, and topic+metadata) to predict patient enrollment, with the latter demonstrating superior performance. The topic+metadata classification model outperformed the other two models in distinguishing between patient enrollment and non-enrollment, with AUC values ranging from 0.81 to 0.99 across models using 3 to 15-topics. The findings suggest that proactively offering to schedule an appointment after the program benefits explanation leads to a higher odds of enrollment. When the scheduling portion of the conversation is not considered, agents should cover all parts of the script over multiple calls. Additionally, agents who explain the program and maintain longer intervals between calls have higher odds of patient enrollment, suggesting that there is value in allowing patients adequate time to reflect between calls. These findings offer valuable insights for agents to evaluate their strategies in patient enrollment. As the first point of contact, enrollment agents play a crucial role in determining whether patients can benefit from care coordination and management programs.},
}