@comment{
  Conference paper: "Better LLM Reasoning via Dual-Play" (Zhang, Huang, Li, Cardie),
  in Findings of the Association for Computational Linguistics: ACL 2026.
  NOTE(review): the url field points at the preview.aclanthology.org ingest path;
  confirm whether a canonical URL should replace it once the volume is published.
}
@inproceedings{zhang-etal-2026-better,
  title     = {Better {LLM} Reasoning via Dual-Play},
  author    = {Zhang, Zhengxin and
               Huang, Chengyu and
               Li, Aochong Oliver and
               Cardie, Claire},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1752/},
  pages     = {35111--35139},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large Language Models (LLMs) have achieved remarkable progress through Reinforcement Learning with Verifiable Rewards (RLVR), yet still rely heavily on external supervision (e.g., curated labels). Adversarial learning, particularly through self-play, offers a promising alternative that enables models to learn from themselves{---}thus reducing reliance on external supervision. Dual-play extends adversarial learning by assigning specialized roles to two models and training them against each other, fostering sustained competition and mutual evolution. Despite its promise, adapting dual-play training to LLMs remains limited. In this paper, we introduce PasoDoble, a novel LLM dual-play framework. PasoDoble adversarially trains two models initialized from the same base model: a Proposer, which generates challenging questions with ground-truth answers, and a Solver, which attempts to solve them. We enrich the Proposer with knowledge from a pre-training dataset to ensure the questions' quality and diversity. To avoid reward hacking, the Proposer is rewarded for producing only valid questions that push the Solver{'}s limit, while the Solver is rewarded for solving them correctly, and both are updated jointly. Experimental results show that PasoDoble can improve the math reasoning performance of LLMs.},
}
Markdown (Informal)
[Better LLM Reasoning via Dual-Play](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1752/) (Zhang et al., Findings 2026)
ACL
- Zhengxin Zhang, Chengyu Huang, Aochong Oliver Li, and Claire Cardie. 2026. Better LLM Reasoning via Dual-Play. In Findings of the Association for Computational Linguistics: ACL 2026, pages 35111–35139, San Diego, California, United States. Association for Computational Linguistics.