% ACL 2026 long paper (Volume 1). NOTE(review): url points at the ACL Anthology
% preview (ingest) site -- confirm and swap for the canonical URL once published.
@inproceedings{wang-etal-2026-thinking,
  title     = {More Thinking, Less Talking: Internalizing Deliberative Safety into {LLM} Parameters},
  author    = {Wang, Guan and
               Tang, Xuehai and
               Zhou, Biyu and
               Han, Jizhong and
               Hu, Songlin},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1572/},
  pages     = {34079--34094},
  isbn      = {979-8-89176-390-6},
  abstract  = {Prevailing safety alignment methods still leave Large Language Models (LLMs) vulnerable to sophisticated jailbreak attacks. To bolster defenses, explicit reasoning mechanisms like Safety-oriented Chain-of-Thought (SCoT) have emerged, significantly enhancing robustness. However, this transparency introduces a critical trade-off: the exposed reasoning process itself becomes a new attack surface, risking the leakage of harmful information and revealing the model{'}s safety logic to adversaries. This paper directly confronts this dilemma, asking: Can we achieve the full benefits of deliberative safety without the costs of explicit reasoning generation? We propose Safety Reasoning Internalization to make the deliberative process in SCoT ``available but not visible''. This approach is grounded in a key theoretical insight: the corrective influence of an SCoT can be effectively approximated by a targeted, low-rank update to the model{'}s Feed-Forward Network (FFN) layers. We operationalize this through Hierarchical Internalization of Adversarially-Guided Reasoning (HIAR), a layer-wise safety alignment framework that internalizes safety reasoning into an implicit computational pathway using Low-Rank Adaptation (LoRA). HIAR enables the model to reach a safe conclusion within a single forward pass, entirely eliminating the need to generate vulnerable SCoT text. Extensive experiments on various LLMs demonstrate that HIAR achieves a 43{\%} lower Attack Success Rate (ASR) against distinct jailbreak attacks compared to strong baselines.},
}
Markdown (Informal)
[More Thinking, Less Talking: Internalizing Deliberative Safety into LLM Parameters](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1572/) (Wang et al., ACL 2026)
ACL