% Conference paper in the Findings of ACL 2026 volume (Wang, Wei, Liu, Zhou, Chen).
% NOTE(review): the url field points at a "preview ... ingest-acl" host; confirm
% whether a canonical address should replace it before this entry is relied on.
@inproceedings{wang-etal-2026-false,
  title     = {False Sense of Security: Why Probing-based Malicious Input Detection Fails to Generalize},
  author    = {Wang, Cheng and Wei, Zeming and Liu, Qin and Zhou, Wenxuan and Chen, Muhao},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1300/},
  pages     = {26100--26113},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large Language Models (LLMs) can comply with harmful instructions, raising serious safety concerns despite their impressive capabilities. Recent work has leveraged probing-based approaches to study the separability of malicious and benign inputs in LLMs' internal representations, and researchers have proposed using such probing methods for safety detection. We systematically re-examine this paradigm. Motivated by poor out-of-distribution performance, we hypothesize that probes learn superficial patterns rather than semantic harmfulness. Through controlled experiments, we confirm this hypothesis and identify the specific patterns learned: instructional patterns and trigger words. Our investigation follows a systematic approach, progressing from demonstrating comparable performance of simple n-gram methods, to controlled experiments with semantically cleaned datasets, to detailed analysis of pattern dependencies. These results reveal a false sense of security around current probing-based approaches and highlight the need to redesign both models and evaluation protocols, for which we provide further discussions in the hope of suggesting responsible further research in this direction.},
}
Markdown (Informal)
[False Sense of Security: Why Probing-based Malicious Input Detection Fails to Generalize](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1300/) (Wang et al., Findings 2026)
ACL