% NOTE(review): `url` below points at the ACL Anthology preview/ingest host; confirm the canonical URL before citing.
@inproceedings{zhou-etal-2026-policy,
  title     = {Policy-Guided Stepwise Action Planning for Controllable {LLM} Reasoning},
  author    = {Zhou, Jianpeng and
               Hu, Qisheng and
               Wang, Jiahai and
               Wang, Wenya},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2024/},
  pages     = {40740--40765},
  isbn      = {979-8-89176-395-1},
  abstract  = {Steering large language model (LLM) reasoning via high-level reasoning actions offers a promising approach to improve robustness and interpretability. However, existing action-based paradigms, ranging from training-free prompting to static plan retrieval or prediction, often fail to consistently outperform standard generation because their planners tend to degenerate into repetitive loops or fixed patterns. We propose PG-HAP (Policy-Guided High-Level Action Planning), a lightweight stepwise planner{--}executor framework that learns to select reasoning actions dynamically while keeping the executor LLM fully frozen. The planner is trained with reinforcement learning to optimize answer correctness. To prevent degeneration, we introduce two targeted mechanisms: (i) an \textit{Action-Dependency Logit Mask} that enforces valid transitions to avoid redundancy, and (ii) an \textit{Action Diversity Reward} that discourages mode collapse by promoting varied action sequences. Across mathematical and commonsense reasoning benchmarks, PG-HAP improves accuracy over strong baselines while producing less redundant, more adaptive trajectories. This demonstrates that learning high-level planning alone can substantially strengthen reasoning without expensive end-to-end model tuning.},
}

Markdown (Informal)
[Policy-Guided Stepwise Action Planning for Controllable LLM Reasoning](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2024/) (Zhou et al., Findings 2026)
ACL