@inproceedings{joshi-etal-2025-saber,
title = "{SABER}: Uncovering Vulnerabilities in Safety Alignment via Cross-Layer Residual Connection",
author = "Joshi, Maithili and
Nandi, Palash and
Chakraborty, Tanmoy",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.825/",
pages = "16310--16325",
ISBN = "979-8-89176-332-6",
abstract = "Large Language Models (LLMs) with safe-alignment training are powerful instruments with robust language comprehension capability. Typically LLMs undergo careful alignment training involving human feedback to ensure the acceptance of safe inputs while rejection of harmful or unsafe ones. However, these humongous models are still vulnerable to jailbreak attacks, in which malicious users attempt to generate harmful outputs that safety-aligned LLMs are trained to avoid. In this study, we find that the safety mechanisms in LLMs are predominantly prevalent in the middle-to-late layers. Based on this observation, we introduce a novel white-box jailbreak method SABER (Safety Alignment Bypass via Extra Residuals) that connects two intermediate layer s and e such that s{\ensuremath{<}}e with a residual connection, achieving an improvement of 51{\%} over the best performing baseline GCG on HarmBench test set. Moreover, model demonstrates only a marginal shift in perplexity when evaluated on the validation set of HarmBench."
}
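
The abstract describes adding a single extra residual connection that carries hidden states from an earlier decoder layer s directly into a later layer e (with s < e). The snippet below is a minimal, hypothetical PyTorch/Hugging Face sketch of that general idea, not the authors' released implementation: the model name, the layer indices, the scaling factor, and the hook-based wiring are all illustrative assumptions.

```python
# Minimal sketch: add one extra cross-layer residual connection from an earlier
# decoder layer s to a later layer e (s < e) in a Hugging Face causal LM.
# Model choice, layer indices, scaling factor, and the hook-based wiring are
# illustrative assumptions, not the authors' released implementation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # hypothetical target model
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

layers = model.model.layers
n_layers = len(layers)
s, e = n_layers // 3, (3 * n_layers) // 4  # source and target layers, s < e (illustrative)
alpha = 1.0                                # scaling of the extra residual branch (illustrative)
_cache = {}

def save_source_output(module, inputs, output):
    # Decoder layers usually return a tuple whose first element is the hidden states.
    _cache["h_s"] = output[0] if isinstance(output, tuple) else output

def add_extra_residual(module, args, kwargs):
    # Forward pre-hook on layer e: add the cached hidden states from layer s
    # to the hidden states entering layer e.
    hidden = args[0] if args else kwargs["hidden_states"]
    patched = hidden + alpha * _cache["h_s"]
    if args:
        return (patched,) + args[1:], kwargs
    kwargs["hidden_states"] = patched
    return args, kwargs

h1 = layers[s].register_forward_hook(save_source_output)
h2 = layers[e].register_forward_pre_hook(add_extra_residual, with_kwargs=True)

prompt = "Write a short poem about mountains."
inputs = tok(prompt, return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)
print(tok.decode(out[0], skip_special_tokens=True))

h1.remove()  # remove the hooks to restore the original forward pass
h2.remove()
```

Because the forward hook on layer s fires before the pre-hook on layer e within the same forward pass, the cached activation always matches the shape of the tensor entering layer e, including during incremental decoding with a KV cache.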