% ACL 2026 long paper (Kim, Kim, and Kang): AG-GRPO for masked diffusion language models.
% NOTE(review): the url field points at an ACL Anthology preview/ingest host;
% confirm the canonical Anthology URL (and add a DOI if one exists) before release.
@inproceedings{kim-etal-2026-ag,
  title     = {{AG}-{GRPO}: Answer-Guided {GRPO} for Masked Diffusion Language Models},
  author    = {Kim, Juhyeong and
               Kim, Gyunyeop and
               Kang, Sangwoo},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1724/},
  pages     = {37175--37191},
  isbn      = {979-8-89176-390-6},
  abstract  = {Reinforcement learning with verifiable rewards (RLVR) typically evaluates only final outcomes, providing limited learning signal about whether the generated reasoning is consistent with the correct answer. As a result, even when ground-truth answers are available during training, on-policy rollouts can repeatedly produce reasoning that is inconsistent with the answer. We propose Answer-Guided Group Relative Policy Optimization (AG-GRPO) for masked diffusion language models (dLLMs), which generate text through iterative masked-token restoration. AG-GRPO combines standard answer-free (AF) rollouts, sampled without access to the ground-truth answer, with answer-guided (AG) rollouts. In AG rollouts, the model generates reasoning conditioned on an anchored ground-truth answer suffix, and then re-predicts the answer from the generated reasoning for reward computation. We compute group-relative advantages over the combined AF/AG rollout set, allowing answer-guided training signals to improve the answer-free policy used at test time. Across mathematics, puzzle-solving, and code-generation benchmarks, AG-GRPO consistently improves over the pretrained dLLM and prior RL method for masked dLLMs. We further analyze optimization dynamics to study how shared group-relative advantages support signal transfer and affect convergence. Our code is available at https://github.com/JuHyng/ag{\_}grpo.},
}
Markdown (Informal)
[AG-GRPO: Answer-Guided GRPO for Masked Diffusion Language Models](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1724/) (Kim et al., ACL 2026)
ACL
- Juhyeong Kim, Gyunyeop Kim, and Sangwoo Kang. 2026. AG-GRPO: Answer-Guided GRPO for Masked Diffusion Language Models. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 37175–37191, San Diego, California, United States. Association for Computational Linguistics.