@inproceedings{wang-etal-2025-vulnerability,
title = "Vulnerability of Large Language Models to Output Prefix Jailbreaks: Impact of Positions on Safety",
author = "Wang, Yiwei and
Chen, Muhao and
Peng, Nanyun and
Chang, Kai-Wei",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.findings-naacl.219/",
pages = "3939--3952",
ISBN = "979-8-89176-195-7",
abstract = "Previous research on jailbreak attacks has mainly focused on optimizing the adversarial snippet content injected into input prompts to expose LLM security vulnerabilities. A significant portion of this research focuses on developing more complex, less readable adversarial snippets that can achieve higher attack success rates. In contrast to this trend, our research investigates the impact of the adversarial snippet{'}s position on the effectiveness of jailbreak attacks. We find that placing a simple and readable adversarial snippet at the beginning of the output effectively exposes LLM safety vulnerabilities, leading to much higher attack success rates than the input suffix attack or prompt-based output jailbreaks. Precisely speaking, we discover that directly enforcing the user{'}s target embedded output prefix is an effective method to expose LLMs' safety vulnerabilities."
}
Markdown (Informal)
[Vulnerability of Large Language Models to Output Prefix Jailbreaks: Impact of Positions on Safety](https://preview.aclanthology.org/fix-sig-urls/2025.findings-naacl.219/) (Wang et al., Findings 2025)
ACL