@comment{Conference-paper record in ACL Anthology export format. NOTE(review): the url field points at a preview.aclanthology.org ingestion build; confirm the canonical Anthology URL once the volume is published.}
@inproceedings{tran-etal-2026-exploiting,
    title = "Exploiting Tree Structure for Credit Assignment in Reinforcement Learning with Large Language Models",
    author = "Tran, Hieu and
      Yao, Zonghai and
      Yu, Hong",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.524/",
    pages = "10795--10810",
    isbn = "979-8-89176-395-1",
    abstract = "Reinforcement learning has shown strong promise for strengthening the reasoning ability of large language models (LLMs), but sparse, delayed rewards over long chains make token-level credit assignment a central challenge. Actor{--}critic methods like PPO provide token-level credit but require training a value network alongside the policy, which introduces complexity and can encourage overfitting. Critic-free alternatives such as GRPO avoid this burden but rely on sequence-level outcomes, distributing a single reward uniformly across tokens and ignoring structural differences between responses. We propose Prefix-to-Tree (P2T), which organizes the sampled responses of a prompt into a prefix tree and computes nonparametric prefix values by aggregating descendant outcomes. Building on this idea, we develop TEMPO (Tree-Estimated Mean Prefix Value for Policy Optimization), a critic-free algorithm that enriches GRPO with branch-aware temporal-difference (TD) corrections. Across Qwen3-1.7B and Qwen3-4B, TEMPO consistently improves both convergence and final performance over PPO and GRPO on in-distribution benchmarks (MATH, MedQA) and out-of-distribution settings (GSM-HARD, AMC23, MedMCQA, MMLU-Medical), achieving higher validation accuracy within comparable wall-clock time."
}
Markdown (Informal)
[Exploiting Tree Structure for Credit Assignment in Reinforcement Learning with Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.524/) (Tran et al., Findings 2026)
ACL