% ACL Anthology record for Findings of EMNLP 2025, paper 1388.
% The url field uses the permanent aclanthology.org address rather than a
% preview-branch build, which disappears once the branch is merged.
@inproceedings{borile-abrate-2025-generalize,
    title     = {How to Generalize the Detection of {AI}-Generated Text: Confounding Neurons},
    author    = {Borile, Claudio and
                 Abrate, Carlo},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-emnlp.1388/},
    doi       = {10.18653/v1/2025.findings-emnlp.1388},
    pages     = {25461--25476},
    isbn      = {979-8-89176-335-7},
    abstract  = {Detectors of LLM-generated text suffer from poor domain shifts generalization ability. Yet, reliable text detection methods in the wild are of paramount importance for plagiarism detection, integrity of the public discourse, and AI safety. Linguistic and domain confounders introduce spurious correlations, leading to poor out-of-distribution (OOD) performance. In this work we introduce the concept of confounding neurons, individual neurons within transformers-based detectors that encode dataset-specific biases rather than task-specific signals. Leveraging confounding neurons, we propose a novel post-hoc, neuron-level intervention framework to disentangle AI-generated text detection factors from data-specific biases. Through extensive experiments we prove its ability to effectively reduce topic-specific biases, enhancing the model{'}s ability to generalize across domains.},
}
Markdown (Informal)
[How to Generalize the Detection of AI-Generated Text: Confounding Neurons](https://aclanthology.org/2025.findings-emnlp.1388/) (Borile & Abrate, Findings 2025)
ACL