% Conference paper from the ACL 2025 long-papers volume. The url field uses the
% permanent ACL Anthology address for Anthology ID 2025.acl-long.245 rather than
% the temporary ingestion-preview host.
@inproceedings{gu-etal-2025-improve,
  title     = {Improve Safety Training of Large Language Models with Safety-Critical Singular Vectors Localization},
  author    = {Gu, Peijian and
               Wang, Quan and
               Mao, Zhendong},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.245/},
  pages     = {4941--4954},
  isbn      = {979-8-89176-251-0},
  abstract  = {The rapid advancement of large language models (LLMs) has brought about increased concerns regarding their safety, especially as adversaries develop jailbreak techniques to bypass LLMs' safety mechanism. Although recent work on safety training with modules such as low-rank adaptation (LoRA) to resist jailbreaks shows promise, these approaches can inadvertently degrade a model{'}s general utility. In this paper, we propose a novel plug-and-play method that mitigates the impact of safety training on model utility by explicitly locating and leveraging safety-critical singular vectors, which only contribute to safety, within the model{'}s parameter space. We quantify the safety-criticality of each singular vector as the difference of their importance for safety and utility measured by a corresponding low-rank projection. The top scored singular vectors are located as safety-critical and are used to initialize the LoRA modules within existing safety training methods in a plug-and-play manner, thereby constraining the training updates within safety-critical parameters. Additionally, we propose a dynamic rank number determination strategy to further reduce parameter overhead. Experiments on HarmBench with multiple jailbreak methods validate the effectiveness of our approach in safety training, while evaluations on several utility benchmarks demonstrate that our method successfully mitigates the adverse impact of safety training on model utility, enhancing the utility performance of the evaluated safety training baselines.},
}
Markdown (Informal)
[Improve Safety Training of Large Language Models with Safety-Critical Singular Vectors Localization](https://aclanthology.org/2025.acl-long.245/) (Gu et al., ACL 2025)
ACL