@inproceedings{zhang-etal-2025-intention,
title = "Intention Analysis Makes {LLM}s A Good Jailbreak Defender",
author = "Zhang, Yuqi and
Ding, Liang and
Zhang, Lefei and
Tao, Dacheng",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.199/",
pages = "2947--2968",
abstract = "Aligning large language models (LLMs) with human values, particularly when facing complex and stealthy jailbreak attacks, presents a formidable challenge. Unfortunately, existing methods often overlook this intrinsic nature of jailbreaks, which limits their effectiveness in such complex scenarios. In this study, we present a simple yet highly effective defense strategy, i.e., Intention Analysis (IA). IA works by triggering LLMs' inherent self-correct and improve ability through a two-stage process: 1) analyzing the essential intention of the user input, and 2) providing final policy-aligned responses based on the first round conversation. Notably,IA is an inference-only method, thus could enhance LLM safety without compromising their helpfulness. Extensive experiments on varying jailbreak benchmarks across a wide range of LLMs show that IA could consistently and significantly reduce the harmfulness in responses (averagely -48.2{\%} attack success rate). Encouragingly, with our IA, Vicuna-7B even outperforms GPT-3.5 regarding attack success rate. We empirically demonstrate that, to some extent, IA is robust to errors in generated intentions. Further analyses reveal the underlying principle of IA: suppressing LLM{'}s tendency to follow jailbreak prompts, thereby enhancing safety."
}
Markdown (Informal)
[Intention Analysis Makes LLMs A Good Jailbreak Defender](https://aclanthology.org/2025.coling-main.199/) (Zhang et al., COLING 2025)
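The two-stage process described in the abstract maps naturally onto a plain chat pipeline: one inference call to extract the query's essential intention, and a second call that answers conditioned on that first-round conversation. Below is a minimal sketch of this idea, assuming a generic `chat(messages) -> str` completion function as a placeholder; the prompt wording is illustrative and does not reproduce the paper's exact prompts.

```python
from typing import Callable, Dict, List

Message = Dict[str, str]


def intention_analysis_defense(
    user_input: str,
    chat: Callable[[List[Message]], str],
) -> str:
    """Sketch of two-stage Intention Analysis (IA): analyze intent, then answer."""
    # Stage 1: ask the model to identify the essential intention of the
    # user input, without answering it yet.
    stage1: List[Message] = [
        {
            "role": "user",
            "content": (
                "Identify the essential intention behind the following user "
                f"query. Do not answer it yet.\n\nQuery: {user_input}"
            ),
        }
    ]
    intention = chat(stage1)

    # Stage 2: respond to the original query, conditioned on the first-round
    # conversation, staying aligned with safety policy.
    stage2 = stage1 + [
        {"role": "assistant", "content": intention},
        {
            "role": "user",
            "content": (
                "Knowing its essential intention, now respond to the query "
                "above. If the intention is harmful, refuse and explain why; "
                "otherwise, answer helpfully."
            ),
        },
    ]
    return chat(stage2)
```

Because both stages are ordinary inference calls, a defense of this shape requires no fine-tuning and adds one extra model call per query, consistent with the abstract's description of IA as inference-only.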