@comment{Long-paper entry from the ACL 2026 proceedings (Volume 1). NOTE(review): the url field targets the Anthology preview/ingest host; confirm whether a canonical address should replace it once the volume is published.}
@inproceedings{zhang-etal-2026-llm,
  title     = {{LLM-VA}: Resolving the Jailbreak-Overrefusal Trade-off via Vector Alignment},
  author    = {Zhang, Haonan and
               Wang, Dongxia and
               Liu, Yi and
               Chen, Kexin and
               Wang, Wenhai},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.260/},
  pages     = {5760--5776},
  isbn      = {979-8-89176-390-6},
  abstract  = {Safety-aligned LLMs suffer from two failure modes: jailbreak (responding to harmful inputs) and over-refusal (declining benign queries). Existing vector steering methods adjust the magnitude of answer vectors, but this creates a fundamental trade-off{---}reducing jailbreak increases over-refusal and vice versa. We identify the root cause: LLMs encode the decision to respond (answer vector $v_a$) and the judgment of input safety (benign vector $v_b$) as nearly orthogonal directions, treating them as independent processes. We propose LLM-VA, which aligns $v_a$ with $v_b$ through closed-form weight updates, making the model{'}s willingness to respond causally dependent on its safety assessment{---}without fine-tuning or architectural changes. Our method identifies vectors at each layer using SVMs, selects safety-relevant layers, and iteratively aligns vectors via minimum-norm weight modifications. Experiments on 12 LLMs demonstrate that LLM-VA achieves 11.45{\%} higher F1 than the best baseline while preserving 95.92{\%} utility, and automatically adapts to each model{'}s safety bias without manual tuning. Code and models are available at https://hotbento.github.io/LLM-VA-Web/.},
}
Markdown (Informal)
[LLM-VA: Resolving the Jailbreak-Overrefusal Trade-off via Vector Alignment](https://preview.aclanthology.org/ingest-acl/2026.acl-long.260/) (Zhang et al., ACL 2026)
ACL