@inproceedings{ren-2025-lsrl,
title = "{LSRL}: Process-Supervised {GRPO} on Latent Recurrent States Improves Mathematical Reasoning",
author = "Ren, Hangliang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.669/",
doi = "10.18653/v1/2025.findings-emnlp.669",
pages = "12534--12545",
ISBN = "979-8-89176-335-7",
abstract = "Latent-recurrent language models solve tasks by iteratively refining hidden states rather than emitting chain-of-thought tokens, yet the opacity of those hidden trajectories hinders credit assignment and limits mathematical reasoning accuracy. We propose Latent-State Supervised Reinforcement Learning (LSRL), a process-supervised variant of Guided Reward Policy Optimization (GRPO) that delivers dense rewards at every latent step. We decode each recurrent depth of a 3.5-billion-parameter Huginn model and score the partial solutions with a GPT-4.1-nano grader aligned to final-answer correctness. Using LoRA adapters, we update the policy on a single NVIDIA L40S GPU with only 500 GSM-8K training problems. Relative to the depth-8 supervised Huginn baseline, LSRL improves absolute accuracy by +4.27 points on GSM-8K and +2.06 points on MathQA. These results demonstrate that rewarding latent steps provides an efficient route to stronger mathematical reasoning in latent-recurrent language models."
}
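
For intuition, here is a minimal sketch of the core idea the abstract describes: grading the decoded partial solution at every recurrent depth and turning those dense scores into GRPO-style group-relative advantages. This is not the authors' implementation; `group_relative_advantages` and `lsrl_loss` are hypothetical names, random tensors stand in for Huginn log-probs and GPT-4.1-nano grader scores, and the paper's LoRA adapters, latent-state decoding, and any clipping or KL terms are omitted.

```python
import torch

def group_relative_advantages(rewards: torch.Tensor) -> torch.Tensor:
    """GRPO-style advantages: normalize each depth's reward within the
    group of G sampled rollouts for the same problem (no value network)."""
    mean = rewards.mean(dim=0, keepdim=True)                 # (1, R)
    std = rewards.std(dim=0, keepdim=True).clamp_min(1e-6)   # (1, R)
    return (rewards - mean) / std                            # (G, R)

def lsrl_loss(logprobs: torch.Tensor, rewards: torch.Tensor) -> torch.Tensor:
    """Dense process-supervised policy-gradient loss (sketch).

    logprobs: (G, R) log-probs of the decoded partial solution at each
              of R recurrent depths, for G rollouts of one problem.
    rewards:  (G, R) grader scores for the same partial solutions.
    """
    adv = group_relative_advantages(rewards).detach()
    return -(adv * logprobs).mean()

# Toy usage: 4 rollouts, 8 recurrent depths (matching the depth-8 setting).
G, R = 4, 8
logprobs = torch.randn(G, R, requires_grad=True)  # stand-in for model log-probs
rewards = torch.rand(G, R)                        # stand-in grader scores in [0, 1]
loss = lsrl_loss(logprobs, rewards)
loss.backward()
print(float(loss))
```

The point of the sketch is the reward shape: instead of one scalar reward per completed answer, each of the R latent steps gets its own grader score, so credit assignment reaches individual refinement steps rather than only the final output.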