% ThinkGuard (Wen, Zhou, Mo, Chen) -- paper in Findings of ACL 2025.
% The url field holds the canonical ACL Anthology address rather than a
% preview-branch build, so the link stays valid after the preview is removed.
@inproceedings{wen-etal-2025-thinkguard,
  title     = {{ThinkGuard}: Deliberative Slow Thinking Leads to Cautious Guardrails},
  author    = {Wen, Xiaofei and
               Zhou, Wenxuan and
               Mo, Wenjie Jacky and
               Chen, Muhao},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.704/},
  pages     = {13698--13713},
  isbn      = {979-8-89176-256-5},
  abstract  = {Ensuring the safety of large language models (LLMs) is critical as they are deployed in real-world applications. Existing guardrails rely on rule-based filtering or single-pass classification, limiting their ability to handle nuanced safety violations. To address this, we propose ThinkGuard, a critique-augmented guardrail model that distills knowledge from high-capacity LLMs by generating structured critiques alongside safety labels. Fine-tuned on critique-augmented data, the captured deliberative thinking ability drastically enhances the guardrail's cautiousness and interpretability. Evaluated on multiple safety benchmarks, ThinkGuard achieves the highest average F1 and AUPRC, outperforming all baselines. Compared to LLaMA Guard 3, ThinkGuard improves accuracy by 16.1\% and macro F1 by 27.0\%. Moreover, it surpasses label-only fine-tuned models, confirming that structured critiques enhance both classification precision and nuanced safety reasoning while maintaining computational efficiency.},
}
Markdown (Informal)
[ThinkGuard: Deliberative Slow Thinking Leads to Cautious Guardrails](https://aclanthology.org/2025.findings-acl.704/) (Wen et al., Findings 2025)
ACL