% Conference paper in the Findings of ACL 2026 volume (pages 13858--13874).
% NOTE(review): the url field points at the preview.aclanthology.org ingest
% host; presumably it should be replaced by the canonical Anthology URL once
% the volume is published -- confirm before relying on it.
% NOTE(review): address holds the conference location (Anthology export
% convention), not the publisher's city -- confirm this suits the target style.
@inproceedings{huang-etal-2026-self,
  title     = {Self-Reflection Improves Safety of Large Reasoning Models},
  author    = {Huang, Qiang and
               Zhai, Wei and
               Huang, Feng and
               Dou, Dejing},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.678/},
  pages     = {13858--13874},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large Reasoning Models (LRMs) have achieved significant breakthroughs over prior large language models{~}(LLMs), but they also entail greater potential safety risks. Existing alignment methods often remain at a shallow level of protection, making them insufficient to address deeper risks and strategic attacks in complex reasoning processes. To bridge this gap, we move beyond the conventional paradigm that treats safety alignment merely as a preventive measure to reduce harmful outputs. Drawing inspiration from human-like introspection and self-correction, we propose Self-Reflection, a technique that introduces a special Self-Reflection token, enabling LRMs to perform Self-Reflection during generation and recover from harmful outputs. Our approach integrates seamlessly into standard post-training paradigms, further enhancing both helpfulness and safety. The experimental results demonstrate that models trained with Self-Reflection not only consistently outperform the baseline in terms of safety (reducing the HCR from 13.8{\%} to 4.1{\%}, nearly a threefold improvement over mainstream approaches), but also achieve substantial advantages in both helpfulness and the safety{--}helpfulness balance. More importantly, under evaluations involving various adversarial attacks, including a specially designed adaptive attack, the Self-Reflection mechanism significantly enhances model safety without targeted adversarial training. Notice: This paper contains harmful content.},
}
Markdown (Informal)
[Self-Reflection Improves Safety of Large Reasoning Models](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.678/) (Huang et al., Findings 2026)
ACL
- Qiang Huang, Wei Zhai, Feng Huang, and Dejing Dou. 2026. Self-Reflection Improves Safety of Large Reasoning Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 13858–13874, San Diego, California, United States. Association for Computational Linguistics.