@article{hoq_shi_leinonen_babalola_lynch_price_akram_2024, title={Detecting ChatGPT-Generated Code Submissions in a CS1 Course Using Machine Learning Models}, url={https://doi.org/10.1145/3626252.3630826}, DOI={10.1145/3626252.3630826}, abstractNote={The emergence of publicly accessible large language models (LLMs) such as ChatGPT poses unprecedented risks of new types of plagiarism and cheating where students use LLMs to solve exercises for them. Detecting this behavior will be a necessary component in introductory computer science (CS1) courses, and educators should be well-equipped with detection tools when the need arises. However, ChatGPT generates code non-deterministically, and thus, traditional similarity detectors might not suffice to detect AI-created code. In this work, we explore the affordances of Machine Learning (ML) models for the detection task. We used an openly available dataset of student programs for CS1 assignments and had ChatGPT generate code for the same assignments, and then evaluated the performance of both traditional machine learning models and Abstract Syntax Tree-based (AST-based) deep learning models in detecting ChatGPT code from student code submissions. Our results suggest that both traditional machine learning models and AST-based deep learning models are effective in identifying ChatGPT-generated code with accuracy above 90%. Since the deployment of such models requires ML knowledge and resources that are not always accessible to instructors, we also explore the patterns detected by deep learning models that indicate possible ChatGPT code signatures, which instructors could possibly use to detect LLM-based cheating manually. We also explore whether explicitly asking ChatGPT to impersonate a novice programmer affects the code produced. We further discuss the potential applications of our proposed models for enhancing introductory computer science instruction.}, journal={PROCEEDINGS OF THE 55TH ACM TECHNICAL SYMPOSIUM ON COMPUTER SCIENCE EDUCATION, SIGCSE 2024, VOL. 1}, author={Hoq, Muntasir and Shi, Yang and Leinonen, Juho and Babalola, Damilola and Lynch, Collin and Price, Thomas and Akram, Bita}, year={2024}, pages={526–532} } @article{hoq_chilla_ranjbar_brusilovsky_akram_2023, title={SANN: Programming Code Representation Using Attention Neural Network with Optimized Subtree Extraction}, DOI={10.1145/3583780.3615047}, abstractNote={Automated analysis of programming data using code representation methods offers valuable services for programmers, from code completion to clone detection to bug detection. Recent studies show the effectiveness of Abstract Syntax Trees (AST), pre-trained Transformer-based models, and graph-based embeddings in programming code representation. However, pre-trained large language models lack interpretability, while other embedding-based approaches struggle with extracting important information from large ASTs. This study proposes a novel Subtree-based Attention Neural Network (SANN) to address these gaps by integrating different components: an optimized sequential subtree extraction process using Genetic algorithm optimization, a two-way embedding approach, and an attention network. We investigate the effectiveness of SANN by applying it to two different tasks: program correctness prediction and algorithm detection on two educational datasets containing both small and large-scale code snippets written in Java and C, respectively. The experimental results show SANN's competitive performance against baseline models from the literature, including code2vec, ASTNN, TBCNN, CodeBERT, GPT-2, and MVG, regarding accurate predictive power. Finally, a case study is presented to show the interpretability of our model prediction and its application for an important human-centered computing application, student modeling. Our results indicate the effectiveness of the SANN model in capturing important syntactic and semantic information from students' code, allowing the construction of accurate student models, which serve as the foundation for generating adaptive instructional support such as individualized hints and feedback.}, journal={PROCEEDINGS OF THE 32ND ACM INTERNATIONAL CONFERENCE ON INFORMATION AND KNOWLEDGE MANAGEMENT, CIKM 2023}, author={Hoq, Muntasir and Chilla, Sushanth Reddy and Ranjbar, Melika Ahmadi and Brusilovsky, Peter and Akram, Bita}, year={2023}, pages={783–792} }