@inproceedings{he-etal-2025-rewarding,
title = "Rewarding the Unlikely: Lifting {GRPO} Beyond Distribution Sharpening",
author = "He, Andre Wang and
Fried, Daniel and
Welleck, Sean",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.emnlp-main.1298/",
doi = "10.18653/v1/2025.emnlp-main.1298",
pages = "25559--25571",
ISBN = "979-8-89176-332-6",
abstract = "Reinforcement learning is emerging as a primary driver for improving language model reasoning capabilities. A fundamental question is whether current reinforcement learning algorithms{---}such as Group Relative Policy Optimization (GRPO), the de facto standard algorithm used to improve language model reasoning{---}merely sharpen the base model{'}s distribution around problems it can already solve. We investigate this question in the context of formal theorem proving, which has access to a perfect verifier. We identify a degenerate rank bias in GRPO in which highly probable trajectories are reinforced and rare ones are neglected. This results in distribution sharpening: the model can solve some problems with fewer samples, but underperforms simply sampling more solutions from the original model. To overcome GRPO{'}s rank bias we introduce unlikeliness reward, a simple method for explicitly up-weighting rare but correct solutions. We show that unlikeliness reward mitigates rank bias and improves pass@$N$ across a large range of $N$ in both synthetic and real theorem proving settings. We also uncover an unexpected link between rank bias and a seemingly mundane hyperparameter{---}the number of updates per batch{---}that leads to a second, complementary mitigation. We combine our insights into a revised GRPO training recipe for formal theorem proving, yielding an open pipeline that achieves competitive performance to DeepSeek-Prover-V1.5-RL on the miniF2F-test benchmark."
}