% Findings of ACL 2026 paper (MMR-GRPO). The url field below is a preview/ingest
% link (preview.aclanthology.org/ingest-acl) -- NOTE(review): confirm and swap in
% the canonical Anthology URL once the volume is published.
@inproceedings{wei-huang-2026-mmr,
    title     = "{MMR}-{GRPO}: Accelerating {GRPO}-Style Training through Diversity-Aware Reward Reweighting",
    author    = "Wei, Kangda and
                 Huang, Ruihong",
    editor    = "Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month     = jul,
    year      = "2026",
    address   = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url       = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.467/",
    pages     = "9584--9605",
    ISBN      = "979-8-89176-395-1",
    abstract  = "Group Relative Policy Optimization (GRPO) has become a standard approach for training mathematical reasoning models; however, GRPO training is computationally intensive and usually takes a long time, which consumes substantial computational resources and creates barriers for academic researchers and smaller organizations with limited GPU budgets. In this paper, we propose MMR-GRPO to accelerate GRPO training and reduce the overall training time required to reach peak performance, and the approach adopts Maximal Marginal Relevance to reweigh rewards of multiple rollouts by balancing rollout quality with diversity to reduce rollout redundancy. The rationale is that redundant or similar completions, if repeatedly used to train a model, will create an ``exploitation trap'' and slow down model convergence in GRPO style reinforcement learning. Extensive evaluations across three model sizes (1.5B, 7B, 8B), three GRPO variants, and five mathematical reasoning benchmarks show that MMR-GRPO achieves comparable peak performance while requiring on average 47.9{\%} fewer training steps and 70.2{\%} less wall-clock time. These gains are consistent across models, methods, and benchmarks. Our code is released at: https://github.com/WeiKangda/MMR-GRPO.",
}
Markdown (Informal)
[MMR-GRPO: Accelerating GRPO-Style Training through Diversity-Aware Reward Reweighting](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.467/) (Wei & Huang, Findings 2026)
ACL