@inproceedings{long-etal-2025-jailbreak,
title = "How Jailbreak Defenses Work and Ensemble? A Mechanistic Investigation",
author = "Long, Zhuohan and
Wang, Siyuan and
Liu, Shujun and
Lai, Yuhang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.findings-emnlp.1160/",
doi = "10.18653/v1/2025.findings-emnlp.1160",
pages = "21263--21290",
ISBN = "979-8-89176-335-7",
abstract = "Jailbreak attacks, where harmful prompts bypass generative models' built-in safety, raise serious concerns about model vulnerability. While many defense methods have been proposed, the trade-offs between safety and helpfulness, and their application to Large Vision-Language Models (LVLMs), are not well understood. This paper systematically examines jailbreak defenses by reframing the standard generation task as a binary classification problem to assess model refusal tendencies for both harmful and benign queries. We identify two key defense mechanisms: \textit{safety shift}, which increases refusal rates across all queries, and \textit{harmfulness discrimination}, which improves the model{'}s ability to differentiate between harmful and benign inputs. Using these mechanisms, we develop two ensemble defense strategies{---}inter-mechanism and intra-mechanism ensembles{---}to balance safety and helpfulness. Experiments on the MM-SafetyBench and MOSSBench datasets with LLaVA-1.5 models show that these strategies effectively improve model safety or optimize the trade-off between safety and helpfulness."
}