@inproceedings{wang-etal-2025-secdecoding,
title = "{S}ec{D}ecoding: Steerable Decoding for Safer {LLM} Generation",
author = "Wang, Jiayou and
Liu, Rundong and
Hu, Yue and
Wu, Huijia and
He, Zhaofeng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1118/",
doi = "10.18653/v1/2025.findings-emnlp.1118",
pages = "20504--20521",
ISBN = "979-8-89176-335-7",
abstract = "Large language models (LLMs) have achieved remarkable performance across diverse tasks, yet ensuring output safety remains a fundamental challenge. Existing defense methods often suffer from limited generalization, high computational overhead, or significant utility degradation. In this work, we present SecDecoding, a lightweight decoding-time defense framework that significantly improves output safety without compromising model helpfulness. SecDecoding leverages a pair of small contrastive models, namely a base model and a safety fine-tuned expert, to estimate token-level safety signals by measuring divergence in their output distributions. These signals dynamically steer the target model{'}s generation toward safer trajectories, effectively suppressing unsafe content. Experimental results show that SecDecoding achieves near-zero attack success rates against a wide spectrum of advanced jailbreak attacks across multiple LLMs, while maintaining the model{'}s helpfulness with minimal degradation. Additionally, SecDecoding is a modular and resource-efficient approach that requires only an auxiliary 1-billion-parameter model and is compatible with speculative decoding, offering up to 1.5{\texttimes} inference speedup."
}
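The abstract above describes a contrastive decoding rule: a small safety fine-tuned expert and its untuned base provide a token-level signal that steers the large target model's next-token distribution. Below is a minimal illustrative Python sketch of that idea, not the paper's implementation; the KL-gated steering weight, the alpha_max parameter, and all function names are assumptions made here for clarity.

import numpy as np

def softmax(logits: np.ndarray) -> np.ndarray:
    """Numerically stable softmax over a 1-D logit vector."""
    z = logits - logits.max()
    e = np.exp(z)
    return e / e.sum()

def secdecoding_step(target_logits: np.ndarray,
                     base_logits: np.ndarray,
                     expert_logits: np.ndarray,
                     alpha_max: float = 2.0) -> np.ndarray:
    """One decoding step of the contrastive safety-steering idea.

    All arguments are next-token logits over the same vocabulary:
    `target_logits` from the large model being defended, `base_logits`
    and `expert_logits` from the small base / safety-tuned pair.
    The KL-based gate below is one guess at how a "token-level safety
    signal" could modulate steering strength, not the paper's exact rule.
    """
    p_base, p_expert = softmax(base_logits), softmax(expert_logits)
    # Divergence between the safety expert and its base: large when the
    # expert "disagrees", i.e. when the next token is safety-relevant.
    kl = float(np.sum(p_expert * (np.log(p_expert + 1e-12)
                                  - np.log(p_base + 1e-12))))
    alpha = alpha_max * (1.0 - np.exp(-kl))  # steer harder on disagreement
    # Shift the target distribution toward the safer small-model direction.
    steered = target_logits + alpha * (expert_logits - base_logits)
    return softmax(steered)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    vocab = 8
    probs = secdecoding_step(rng.normal(size=vocab),
                             rng.normal(size=vocab),
                             rng.normal(size=vocab))
    next_token = rng.choice(vocab, p=probs)  # sample the steered token
    print(next_token, probs.round(3))

The alpha * (expert_logits - base_logits) offset mirrors the general contrastive/proxy-style decoding family; the paper's specific claims (near-zero attack success rates, a 1-billion-parameter auxiliary model, and compatibility with speculative decoding) are properties of its full system and are not reproduced by this sketch.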