% ACL 2026 long paper (Luo, Wei, Zhao) proposing FFN-guided activation steering (FGAS).
% Values are brace-delimited; words that must keep their capitals are braced as
% whole words so sentence-casing styles leave them alone.
% NOTE(review): url is on the preview.aclanthology.org ingest host -- presumably a
% staging link; confirm it against the published Anthology record before relying on it.
% NOTE(review): address holds the conference venue as exported, not a publisher city.
@inproceedings{luo-etal-2026-detecting,
  title     = {Detecting What Queries Seek: Steering {LLM} Safety with {FFN} Output Activation Monitoring},
  author    = {Luo, Xiaohao and
               Wei, Ying and
               Zhao, Rui},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1360/},
  pages     = {29500--29514},
  isbn      = {979-8-89176-390-6},
  abstract  = {Recently, activation steering has attracted considerable attention as a low-cost approach to improving the safety of large language models (LLMs). However, most existing methods apply interventions indiscriminately, often causing excessive refusal of benign queries. Although recent works have begun to explore selective intervention, their intervention decisions typically rely on residual stream activations where information is highly entangled, resulting in limited discriminative power and unreliable interventions. To address this issue, we propose FFN-Guided activation steering (FGAS). Motivated by the observation that feed-forward networks (FFNs) in LLMs serve as core modules for knowledge storage, we propose leveraging FFN output activations as more discriminative signals for intervention, since these activations more explicitly reflect the intent of a query. For a given query, FGAS projects the corresponding FFN output activation into a low-dimensional subspace that effectively separates harmful and benign queries, and then makes precise intervention decisions by assessing its similarity to pre-constructed prototype activations representing harmful and benign classes. Extensive experiments demonstrate that FGAS achieves state-of-the-art defense performance against various jailbreak attacks, while nearly preserving the model{'}s original performance on benign tasks.},
}
Markdown (Informal)
[Detecting What Queries Seek: Steering LLM Safety with FFN Output Activation Monitoring](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1360/) (Luo et al., ACL 2026)
ACL