@comment{
  Findings of ACL 2026 paper on CRPS (curriculum replay for long-horizon LLM agents).
  The url field uses the canonical aclanthology.org address rather than the
  preview/staging host the record was exported from; staging links are not stable.
  TODO confirm the canonical page resolves once the volume is ingested.
  The address field holds the conference venue, following ACL Anthology exports.
}
@inproceedings{zhang-yang-2026-crps,
  author    = {Zhang, Zijing and
               Yang, Xiajie},
  title     = {{CRPS}: Curriculum Replay via Progressive Suffixes from Successful Trajectories for Long-Horizon {LLM} Agents},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {13891--13904},
  isbn      = {979-8-89176-395-1},
  url       = {https://aclanthology.org/2026.findings-acl.680/},
  abstract  = {Long-horizon LLM agents trained with sparse terminal rewards tend to experience slow and unstable learning, and the issue is amplified by group-normalized on-policy objectives commonly used for LLM training (e.g., GRPO). When rollout groups collapse to nearly all failures early in training, within-group normalization yields degenerate advantages and weak learning signals. To address this, we propose Curriculum Replay via Progressive Suffixes from Successful Trajectories (CRPS), a lightweight RL-training strategy that turns serendipitous terminal successes into a within-trajectory curriculum. CRPS maintains a buffer of successful trajectories and restarts rollouts from suffix states, with an online controller adapting k to match agent competence and keep replay outcomes informative. Across ALFWorld and WebShop with different foundation models, CRPS consistently outperforms full-episode GRPO and naive experience replay. Group-level diagnostics further show that CRPS reduces degenerate groups ratio and increases within-group outcome diversity, aligning with faster and more stable training.},
}
Markdown (Informal)
[CRPS: Curriculum Replay via Progressive Suffixes from Successful Trajectories for Long-Horizon LLM Agents](https://aclanthology.org/2026.findings-acl.680/) (Zhang & Yang, Findings 2026)
ACL