@inproceedings{zhang-etal-2024-code,
  title     = {Code Membership Inference for Detecting Unauthorized Data Use in Code Pre-trained Language Models},
  author    = {Zhang, Sheng and
               Li, Hui and
               Ji, Rongrong},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.621/},
  doi       = {10.18653/v1/2024.findings-emnlp.621},
  pages     = {10593--10603},
  abstract  = {Code pre-trained language models (CPLMs) have received great attention since they can benefit various tasks that facilitate software development and maintenance. However, CPLMs are trained on massive open-source code, raising concerns about potential data infringement. This paper launches the study of detecting unauthorized code use in CPLMs, i.e., Code Membership Inference (CMI) task. We design a framework Buzzer for different settings of CMI. Buzzer deploys several inference techniques, including signal extraction from pre-training tasks, hard-to-learn sample calibration and weighted inference, to identify code membership status accurately. Extensive experiments show that CMI can be achieved with high accuracy using Buzzer. Hence, Buzzer can serve as a CMI tool and help protect intellectual property rights. The implementation of Buzzer is available at: https://github.com/KDEGroup/Buzzer},
}
Markdown (Informal)
[Code Membership Inference for Detecting Unauthorized Data Use in Code Pre-trained Language Models](https://aclanthology.org/2024.findings-emnlp.621/) (Zhang et al., Findings 2024)
ACL