% Conference paper by Choi and Zu (2025) in the AIME-Con coordinated-session proceedings.
% The url field uses the canonical ACL Anthology address for paper ID 2025.aimecon-sessions.12.
@inproceedings{choi-zu-2025-exploring,
  title     = {Exploring the Interpretability of {AI}-Generated Response Detection with Probing},
  author    = {Choi, Ikkyu and
               Zu, Jiyun},
  editor    = {Wilson, Joshua and
               Ormerod, Christopher and
               Beiting Parrish, Magdalen},
  booktitle = {Proceedings of the Artificial Intelligence in Measurement and Education Conference ({AIME-Con}): Coordinated Session Papers},
  month     = oct,
  year      = {2025},
  address   = {Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States},
  publisher = {National Council on Measurement in Education (NCME)},
  url       = {https://aclanthology.org/2025.aimecon-sessions.12/},
  pages     = {99--106},
  isbn      = {979-8-218-84230-7},
  abstract  = {Multiple strategies for AI-generated response detection have been proposed, with many high-performing ones built on language models. However, the decision-making processes of these detectors remain largely opaque. We addressed this knowledge gap by fine-tuning a language model for the detection task and applying probing techniques using adversarial examples. Our adversarial probing analysis revealed that the fine-tuned model relied heavily on a narrow set of lexical cues in making the classification decision. These findings underscore the importance of interpretability in AI-generated response detectors and highlight the value of adversarial probing as a tool for exploring model interpretability.},
}
Markdown (Informal)
[Exploring the Interpretability of AI-Generated Response Detection with Probing](https://aclanthology.org/2025.aimecon-sessions.12/) (Choi & Zu, AIME-Con 2025)
ACL