@inproceedings{orel-etal-2025-codet,
title = "{C}o{D}et-M4: Detecting Machine-Generated Code in Multi-Lingual, Multi-Generator and Multi-Domain Settings",
author = "Orel, Daniil and
Azizov, Dilshod and
Nakov, Preslav",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.550/",
pages = "10570--10593",
ISBN = "979-8-89176-256-5",
abstract = "Large Language Models (LLMs) have revolutionized code generation, automating programming with remarkable efficiency. However, this has had important consequences for programming skills, ethics, and assessment integrity, thus making the detection of LLM-generated code essential for maintaining accountability and standards. While, there has been some previous research on this problem, it generally lacks domain coverage and robustness, and only covers a small number of programming languages. Here, we aim to bridge this gap. In particular, we propose a framework capable of distinguishing between human-written and LLM-generated program code across multiple programming languages, code generators, and domains. We use a large-scale dataset from renowned platforms and LLM-based code generators, alongside applying rigorous data quality checks, feature engineering, and comparative analysis of traditional machine learning models, pre-trained language models (PLMs), and LLMs for code detection. We perform an evaluation on out-of-domain scenarios, such as detecting authorship and hybrid authorship of generated code and generalizing to unseen models, domains, and programming languages. Our extensive experiments show that our framework effectively distinguishes human-written from LLM-generated program code, setting a new benchmark for the task."
}
Markdown (Informal)
[CoDet-M4: Detecting Machine-Generated Code in Multi-Lingual, Multi-Generator and Multi-Domain Settings](https://preview.aclanthology.org/display_plenaries/2025.findings-acl.550/) (Orel et al., Findings 2025)
ACL