% Conference-paper entry (Findings of ACL 2026) for the CPC-GRPO paper.
% NOTE(review): url points at a preview/ingest host (preview.aclanthology.org/ingest-acl);
% confirm whether the canonical Anthology URL should be used once the volume is published.
@inproceedings{kim-kang-2026-cpc,
  author    = {Kim, Gyunyeop and
               Kang, Sangwoo},
  title     = {{CPC}-{GRPO}: Answer-Free Reinforcement Learning with Cross-Prompt Consensus Rewards},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {29733--29748},
  isbn      = {979-8-89176-395-1},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1486/},
  abstract  = {Reinforcement learning with verifiable rewards has improved reasoning in language models, but it typically relies on a ground-truth answer or an external verifier, which limits applicability and increases cost. We propose an answer-free training objective that derives rewards solely from the model{'}s own probabilities by exploiting prompt paraphrases as multiple semantic views of the same intent. For each paraphrase set, we generate candidate responses, rescore each response under the other paraphrased prompts via teacher forcing, and define a cross-prompt consensus reward that serves as a practical internal training signal, favoring responses supported across views rather than those that fit only a single phrasing. We optimize this reward using a policy update with an all-pairs objective and advantage broadcasting across prompt{--}response pairs. The framework naturally supports prefix-level training, enabling a controllable cost{--}signal trade-off. Experiments on RobustAlpacaEval and out-of-domain reasoning benchmarks (OpenBookQA, AQuA, HumanEval) show strong in-domain gains and competitive or improved average out-of-domain performance over pre-trained and answer-free training baselines on LLaMA3.2-3B and Qwen3-4B, alongside analyses demonstrating reward{--}performance alignment and the importance of design choices such as excluding self-view scores and ensembling-based candidates. All experiment code is available at our GitHub.},
}

Markdown (Informal)
[CPC-GRPO: Answer-Free Reinforcement Learning with Cross-Prompt Consensus Rewards](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1486/) (Kim & Kang, Findings 2026)
ACL