@inproceedings{tomar-etal-2025-unsafechain,
title = "{U}nsafe{C}hain: Enhancing Reasoning Model Safety via Hard Cases",
author = "Tomar, Raj Vardhan and
Nakov, Preslav and
Wang, Yuxia",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.75/",
pages = "1233--1247",
ISBN = "979-8-89176-303-6",
abstract = "As large reasoning models (LRMs) grow more capable, chain-of-thought (CoT) reasoning introduces new safety challenges. Existing SFT-based safety alignment studies dominantly focused on filtering prompts with safe, high-quality responses, while overlooking hard prompts that always elicit harmful outputs. To fill this gap, we introduce UnsafeChain, a safety alignment dataset constructed from hard prompts with diverse sources, where unsafe completions are identified and explicitly corrected into safe responses. By exposing models to unsafe behaviors and guiding their correction, UnsafeChain enhances safety while preserving general reasoning ability. We fine-tune three LRMs on UnsafeChain and compare them against recent SafeChain and STAR-1 across six out-of-distribution and five in-distribution benchmarks. UnsafeChain consistently outperforms prior datasets (21 wins out of 36 settings), with even a small selected-1K subset matching or surpassing baseline performance, demonstrating the effectiveness and generalizability of correction-based supervision.We release our dataset and code at https://github.com/mbzuai-nlp/UnsafeChain."
}

Markdown (Informal)
[UnsafeChain: Enhancing Reasoning Model Safety via Hard Cases](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.75/) (Tomar et al., Findings 2025)
ACL
Raj Vardhan Tomar, Preslav Nakov, and Yuxia Wang. 2025. UnsafeChain: Enhancing Reasoning Model Safety via Hard Cases. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1233–1247, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.