% Publications export, normalised for use as a bibliography database.
% Conventions used below: one field per line; page ranges written with "--";
% months given as the bare three-letter macros; acronyms and proper nouns in
% titles protected with braces; abstracts kept in the "abstract" field with
% LaTeX-special characters escaped.

% Journal article (ACM TOSEM, volume 33, issue 3).
@article{he_zhou_xu_zhang_kim_yang_thung_irsan_lo_2024,
  author   = {He, Junda and Zhou, Xin and Xu, Bowen and Zhang, Ting and Kim, Kisub and Yang, Zhou and Thung, Ferdian and Irsan, Ivana Clairine and Lo, David},
  title    = {Representation Learning for {Stack Overflow} Posts: How Far Are We?},
  journal  = {ACM Transactions on Software Engineering and Methodology},
  volume   = {33},
  number   = {3},
  year     = {2024},
  month    = mar,
  issn     = {1557-7392},
  doi      = {10.1145/3635711},
  abstract = {The tremendous success of Stack Overflow has accumulated an extensive corpus of software engineering knowledge, thus motivating researchers to propose various solutions for analyzing its content. The performance of such solutions hinges significantly on the selection of representation models for Stack Overflow posts. As the volume of literature on Stack Overflow continues to burgeon, it highlights the need for a powerful Stack Overflow post representation model and drives researchers' interest in developing specialized representation models that can adeptly capture the intricacies of Stack Overflow posts. The state-of-the-art (SOTA) Stack Overflow post representation models are Post2Vec and BERTOverflow, which are built upon neural networks such as convolutional neural network and transformer architecture (e.g., BERT). Despite their promising results, these representation methods have not been evaluated in the same experimental setting. To fill the research gap, we first empirically compare the performance of the representation models designed specifically for Stack Overflow posts (Post2Vec and BERTOverflow) in a wide range of related tasks (i.e., tag recommendation, relatedness prediction, and API recommendation). The results show that Post2Vec cannot further improve the SOTA techniques of the considered downstream tasks, and BERTOverflow shows surprisingly poor performance. To find more suitable representation models for the posts, we further explore a diverse set of transformer-based models, including (1) general domain language models (RoBERTa, Longformer, and GPT2) and (2) language models built with software engineering related textual artifacts (CodeBERT, GraphCodeBERT, seBERT, CodeT5, PLBart, and CodeGen). This exploration shows that models like CodeBERT and RoBERTa are suitable for representing Stack Overflow posts. However, it also illustrates the ``No Silver Bullet'' concept, as none of the models consistently wins against all the others. Inspired by the findings, we propose SOBERT, which employs a simple yet effective strategy to improve the representation models of Stack Overflow posts by continuing the pre-training phase with the textual artifact from Stack Overflow. The overall experimental results demonstrate that SOBERT can consistently outperform the considered models and increase the SOTA performance significantly for all the downstream tasks.},
}

% Journal article (IEEE TSE, volume 50, issue 4).
@article{yang_xu_zhang_kang_shi_he_lo_2024,
  author   = {Yang, Zhou and Xu, Bowen and Zhang, Jie M. and Kang, Hong Jin and Shi, Jieke and He, Junda and Lo, David},
  title    = {Stealthy Backdoor Attack for Code Models},
  journal  = {IEEE Transactions on Software Engineering},
  volume   = {50},
  number   = {4},
  pages    = {721--741},
  year     = {2024},
  month    = apr,
  issn     = {1939-3520},
  doi      = {10.1109/TSE.2024.3361661},
  abstract = {Code models, such as CodeBERT and CodeT5, offer general-purpose representations of code and play a vital role in supporting downstream automated software engineering tasks. Most recently, code models were revealed to be vulnerable to backdoor attacks. A code model that is backdoor-attacked can behave normally on clean examples but will produce pre-defined malicious outputs on examples injected with triggers that activate the backdoors. Existing backdoor attacks on code models use unstealthy and easy-to-detect triggers. This paper aims to investigate the vulnerability of code models with stealthy backdoor attacks. To this end, we propose AFRAIDOOR (Adversarial Feature as Adaptive Backdoor). AFRAIDOOR achieves stealthiness by leveraging adversarial perturbations to inject adaptive triggers into different inputs. We apply AFRAIDOOR to three widely adopted code models (CodeBERT, PLBART, and CodeT5) and two downstream tasks (code summarization and method name prediction). We evaluate three widely used defense methods and find that AFRAIDOOR is more unlikely to be detected by the defense methods than by baseline methods. More specifically, when using spectral signature as defense, around 85\% of adaptive triggers in AFRAIDOOR bypass the detection in the defense process. By contrast, only less than 12\% of the triggers from previous work bypass the defense. When the defense method is not applied, both AFRAIDOOR and baselines have almost perfect attack success rates. However, once a defense is applied, the attack success rates of baselines decrease dramatically, while the success rate of AFRAIDOOR remains high. Our finding exposes security weaknesses in code models under stealthy backdoor attacks and shows that state-of-the-art defense methods cannot provide sufficient protection. We call for more research efforts in understanding security threats to code models and developing more effective countermeasures.},
}

% Conference paper (ASE 2023), so typed as inproceedings with the venue in
% booktitle. The exported author list was truncated; "and others" marks that.
% Citation key: the exported key contained a space ("..._le_et al._2023"),
% which is not a legal key character; the space is replaced by an underscore.
@inproceedings{xu_nguyen_le-cong_hoang_liu_kim_gong_niu_wang_le_et_al._2023,
  author    = {Xu, Bowen and Nguyen, Thanh-Dat and Le-Cong, Thanh and Hoang, Thong and Liu, Jiakun and Kim, Kisub and Gong, Chen and Niu, Changan and Wang, Chenyu and Le, Bach and others},
  title     = {Are We Ready to Embrace Generative {AI} for Software {Q\&A}?},
  booktitle = {2023 38th {IEEE/ACM} International Conference on Automated Software Engineering ({ASE})},
  pages     = {1713--1717},
  year      = {2023},
  issn      = {1527-1366},
  doi       = {10.1109/ASE56229.2023.00023},
  abstract  = {Stack Overflow, the world's largest software Q\&A (SQA) website, is facing a significant traffic drop due to the emergence of generative AI techniques. ChatGPT is banned by Stack Overflow after only 6 days from its release. The main reason provided by the official Stack Overflow is that the answers generated by ChatGPT are of low quality. To verify this, we conduct a comparative evaluation of human-written and ChatGPT-generated answers. Our methodology employs both automatic comparison and a manual study. Our results suggest that human-written and ChatGPT-generated answers are semantically similar, however, human-written answers outperform ChatGPT-generated ones consistently across multiple aspects, specifically by 10\% on the overall score. We release the data, analysis scripts, and detailed results at https://github.com/maxxbw54/GAI4SQA.},
}

% Conference paper (ICSME 2023).
@inproceedings{zhou_xu_han_yang_he_lo_2023,
  author    = {Zhou, Xin and Xu, Bowen and Han, DongGyun and Yang, Zhou and He, Junda and Lo, David},
  title     = {{CCBERT}: Self-Supervised Code Change Representation Learning},
  booktitle = {2023 {IEEE} International Conference on Software Maintenance and Evolution ({ICSME})},
  pages     = {182--193},
  year      = {2023},
  issn      = {1063-6773},
  doi       = {10.1109/ICSME58846.2023.00028},
  abstract  = {Numerous code changes are made by developers in their daily work, and a superior representation of code changes is desired for effective code change analysis. Recently, Hoang et al. proposed CC2Vec, a neural network-based approach that learns a distributed representation of code changes to capture the semantic intent of the changes. Despite demonstrated effectiveness in multiple tasks, CC2Vec has several limitations: 1) it considers only coarse-grained information about code changes, and 2) it relies on log messages rather than the self-contained content of the code changes. In this work, we propose CCBERT (Code Change BERT), a new Transformer-based pre-trained model that learns a generic representation of code changes based on a large-scale dataset containing massive unlabeled code changes. CCBERT is pre-trained on four proposed self-supervised objectives that are specialized for learning code change representations based on the contents of code changes. CCBERT perceives fine-grained code changes at the token level by learning from the old and new versions of the content, along with the edit actions. Our experiments demonstrate that CCBERT significantly outperforms CC2Vec or the state-of-the-art approaches of the downstream tasks by 7.7\%--14.0\% in terms of different metrics and tasks. CCBERT consistently outperforms large pre-trained code models, such as CodeBERT, while requiring 6--10$\times$ less training time, 5--30$\times$ less inference time, and 7.9$\times$ less GPU memory.},
}

% Conference paper (ASE 2023).
@inproceedings{zhou_kim_xu_liu_han_lo_2023,
  author    = {Zhou, Xin and Kim, Kisub and Xu, Bowen and Liu, Jiakun and Han, DongGyun and Lo, David},
  title     = {The Devil is in the Tails: How Long-Tailed Code Distributions Impact Large Language Models},
  booktitle = {2023 38th {IEEE/ACM} International Conference on Automated Software Engineering ({ASE})},
  pages     = {40--52},
  year      = {2023},
  issn      = {1527-1366},
  doi       = {10.1109/ASE56229.2023.00157},
  abstract  = {Learning-based techniques, especially advanced Large Language Models (LLMs) for code, have gained considerable popularity in various software engineering (SE) tasks. However, most existing works focus on designing better learning-based models and pay less attention to the properties of datasets. Learning-based models, including popular LLMs for code, heavily rely on data, and the data's properties (e.g., data distribution) could significantly affect their behavior. We conducted an exploratory study on the distribution of SE data and found that such data usually follows a skewed distribution (i.e., long-tailed distribution) where a small number of classes have an extensive collection of samples, while a large number of classes have very few samples. We investigate three distinct SE tasks and analyze the impacts of long-tailed distribution on the performance of LLMs for code. Our experimental results reveal that the long-tailed distribution has a substantial impact on the effectiveness of LLMs for code. Specifically, LLMs for code perform between 30.0\% and 254.0\% worse on data samples associated with infrequent labels compared to data samples of frequent labels. Our study provides a better understanding of the effects of long-tailed distributions on popular LLMs for code and insights for the future development of SE automation.},
}