@inproceedings{bao-etal-2025-fixing,
title = "Fixing Distribution Shifts of {LLM} Self-Critique via On-Policy Self-Play Training",
author = "Bao, Rong and
Yu, Donglei and
Fan, Kai and
Liao, Minpeng",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.865/",
pages = "17680--17700",
ISBN = "979-8-89176-251-0",
abstract = "Self-critique mechanisms significantly improve the performance of language models in complex reasoning tasks by giving them the ability to correct errors, conduct induction and deduction, and switch thinking insights. However, synthetic data methods often require human-introduced errors or sampling of the model{'}s reasoning results from the previous moment, and the current output distribution of the model cannot be obtained, makes the data for critique and reasoning face the problem of distribution shifts. In this work, we propose an on-policy reinforcement learning framework to synchronize the reasoning and critique capabilities of language models. To alleviate reward hacking caused by outcome-based supervision, we design a deliberate reward framework for different purposes. The reward framework not only supervises the model reasoning process based on the results, but also uses Monte Carlo sampling to give appropriate rewards to the critique content according to the success rate of the model{'}s correction after critique. In addition, we introduce a rule-based reward function to impose penalties on the model when it generates hallucinatory critiques. When our approach is applied to the DeepSeek-Math-7B-Base and Qwen2.5-7B-Base models, model performance improves 5.40 and 3.66 points, respectively, compared to the best baseline approach. This validates the significant advantages of our method in improving model{'}s reasoning and self-critique capability. Code will be made available at https://github.com/rbao2018/SCOP"
}
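The abstract describes three reward signals: outcome-based supervision of the reasoning, a Monte Carlo estimate of the post-critique correction success rate, and a rule-based penalty for hallucinatory critiques. The sketch below is only an illustration of how such signals could be combined; it is not the authors' SCOP implementation, and all function names, weights, and the `is_hallucinated` rule are hypothetical assumptions.

```python
# Illustrative sketch (not the authors' code) of combining the three reward
# signals described in the abstract. All names and weights are hypothetical.
from typing import Callable, List


def outcome_reward(final_answer: str, reference_answer: str) -> float:
    """Outcome-based supervision: +1 if the final answer matches the reference."""
    return 1.0 if final_answer.strip() == reference_answer.strip() else 0.0


def critique_reward(corrected_rollouts: List[str], reference_answer: str) -> float:
    """Monte Carlo estimate of the correction success rate after critique.

    `corrected_rollouts` are answers re-sampled from the model after it has seen
    its own critique; the reward is the fraction that end up correct.
    """
    if not corrected_rollouts:
        return 0.0
    hits = sum(outcome_reward(a, reference_answer) for a in corrected_rollouts)
    return hits / len(corrected_rollouts)


def hallucination_penalty(
    critique: str, solution: str, is_hallucinated: Callable[[str, str], bool]
) -> float:
    """Rule-based penalty when the critique flags errors that are not in the solution."""
    return -1.0 if is_hallucinated(critique, solution) else 0.0


def total_reward(
    final_answer: str,
    reference_answer: str,
    critique: str,
    solution: str,
    corrected_rollouts: List[str],
    is_hallucinated: Callable[[str, str], bool],
    w_outcome: float = 1.0,
    w_critique: float = 1.0,
    w_penalty: float = 1.0,
) -> float:
    """Weighted sum of the three signals; the weights are purely illustrative."""
    return (
        w_outcome * outcome_reward(final_answer, reference_answer)
        + w_critique * critique_reward(corrected_rollouts, reference_answer)
        + w_penalty * hallucination_penalty(critique, solution, is_hallucinated)
    )
```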
Markdown (Informal)
[Fixing Distribution Shifts of LLM Self-Critique via On-Policy Self-Play Training](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.865/) (Bao et al., ACL 2025)