@article{basak_neil_reaves_williams_2023, title={SecretBench: A Dataset of Software Secrets}, ISSN={["2160-1852"]}, DOI={10.1109/MSR59073.2023.00053}, abstractNote={According to GitGuardian’s monitoring of public GitHub repositories, the exposure of secrets (API keys and other credentials) increased two-fold in 2021 compared to 2020, totaling more than six million secrets. However, no benchmark dataset is publicly available for researchers and tool developers to evaluate secret detection tools that produce many false positive warnings. The goal of our paper is to aid researchers and tool developers in evaluating and improving secret detection tools by curating a benchmark dataset of secrets through a systematic collection of secrets from open-source repositories. We present a labeled dataset of source codes containing 97,479 secrets (of which 15,084 are true secrets) of various secret types extracted from 818 public GitHub repositories. The dataset covers 49 programming languages and 311 file types.}, journal={2023 IEEE/ACM 20TH INTERNATIONAL CONFERENCE ON MINING SOFTWARE REPOSITORIES, MSR}, author={Basak, Setu Kumar and Neil, Lorenzo and Reaves, Bradley and Williams, Laurie}, year={2023}, pages={347–351} } @article{basak_neil_reaves_williams_2023, title={What Challenges Do Developers Face About Checked-in Secrets in Software Artifacts?}, ISSN={["0270-5257"]}, DOI={10.1109/ICSE48619.2023.00141}, abstractNote={Throughout 2021, GitGuardian's monitoring of public GitHub repositories revealed a two-fold increase in the number of secrets (database credentials, API keys, and other credentials) exposed compared to 2020, accumulating more than six million secrets. To our knowledge, the challenges developers face to avoid checked-in secrets are not yet characterized. The goal of our paper is to aid researchers and tool developers in understanding and prioritizing opportunities for future research and tool automation for mitigating checked-in secrets through an empirical investigation of challenges and solutions related to checked-in secrets. We extract 779 questions related to checked-in secrets on Stack Exchange and apply qualitative analysis to determine the challenges and the solutions posed by others for each of the challenges. We identify 27 challenges and 13 solutions. The four most common challenges, in ranked order, are: (i) store/version of secrets during deployment; (ii) store/version of secrets in source code; (iii) ignore/hide of secrets in source code; and (iv) sanitize VCS history. The three most common solutions, in ranked order, are: (i) move secrets out of source code/version control and use template config file; (ii) secret management in deployment; and (iii) use local environment variables. Our findings indicate that the same solution has been mentioned to mitigate multiple challenges. However, our findings also identify an increasing trend in questions lacking accepted solutions substantiating the need for future research and tool automation on managing secrets.}, journal={2023 IEEE/ACM 45TH INTERNATIONAL CONFERENCE ON SOFTWARE ENGINEERING, ICSE}, author={Basak, Setu Kumar and Neil, Lorenzo and Reaves, Bradley and Williams, Laurie}, year={2023}, pages={1635–1647} } @article{basak_neil_reaves_williams_2022, title={What are the Practices for Secret Management in Software Artifacts?}, DOI={10.1109/SecDev53368.2022.00026}, abstractNote={Throughout 2021, GitGuardian's monitoring of public GitHub repositories revealed a two-fold increase in the number of secrets (database credentials, API keys, and other credentials) exposed compared to 2020, accumulating more than six million secrets. A systematic derivation of practices for managing secrets can help practitioners in secure development. The goal of our paper is to aid practitioners in avoiding the exposure of secrets by identifying secret management practices in software artifacts through a systematic derivation of practices disseminated in Internet artifacts. We conduct a grey literature review of Internet artifacts, such as blog articles and question and answer posts. We identify 24 practices grouped in six categories comprised of developer and organizational practices. Our findings indicate that using local environment variables and external secret management services are the most recommended practices to move secrets out of source code and to securely store secrets. We also observe that using version control system scanning tools and employing short-lived secrets are the most recommended practices to avoid accidentally committing secrets and limit secret exposure, respectively.}, journal={2022 IEEE SECURE DEVELOPMENT CONFERENCE (SECDEV 2022)}, author={Basak, Setu Kumar and Neil, Lorenzo and Reaves, Bradley and Williams, Laurie}, year={2022}, pages={69–76} }