% Findings of ACL 2026 paper by Luo, Wei and Li on the value-driven jailbreak attack (VDJA).
% NOTE(review): url uses the preview.aclanthology.org/ingest-acl path, presumably a staging
% address; confirm the canonical URL before relying on it.
@inproceedings{luo-etal-2026-safety,
  title     = {Safety Guardrails of Large Language Models Are Vulnerable to Value-Driven Adversarial Prompting},
  author    = {Luo, Xiaohao and
               Wei, Ying and
               Li, Zhijun},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1357/},
  pages     = {27238--27255},
  isbn      = {979-8-89176-395-1},
  abstract  = {In the real world, the execution of a task often depends on the executor{'}s recognition of its value. Motivated by this observation, we propose the value-driven jailbreak attack (VDJA), a simple and effective black-box jailbreak method against large language models (LLMs). VDJA first exploits the phenomenon that LLMs tend to agree with humans to induce LLMs to affirm the moral value of harmful tasks. During autoregressive generation, these value-endorsement tokens function as an implicit value prior, making LLMs more likely to accept and generate harmful content. Extensive experiments on five state-of-the-art (SOTA) LLMs demonstrate the superiority of VDJA. Using only a single query and without concealing harmful instructions, VDJA achieves an average attack success rate (ASR) of 91.8{\%} on JailbreakBench and 95.2{\%} on the AdvBench subset, showcasing SOTA jailbreak success rates and attack efficiency. Most importantly, our work suggests a previously underexplored vulnerability in the safety guardrails of LLMs, which highlights the urgent need to enhance their robustness.},
}
Markdown (Informal)
[Safety Guardrails of Large Language Models Are Vulnerable to Value-Driven Adversarial Prompting](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1357/) (Luo et al., Findings 2026)
ACL