% Conference paper: "Self-SoftCoT" by Dong, Shan and Li, in the Proceedings of
% the 64th Annual Meeting of the ACL (Volume 1: Long Papers), 2026.
%
% Case protection braces whole words (e.g. {Self-SoftCoT}) rather than single
% letters, so sentence-casing styles keep the capitals without mid-word braces.
%
% NOTE(review): the url below points at the Anthology preview/ingest host;
% confirm whether a canonical aclanthology.org URL (or a DOI) exists and
% replace it once the volume is published.
% NOTE(review): address looks like the conference venue rather than the
% publisher's city; kept exactly as exported -- confirm against house style.
@inproceedings{dong-etal-2026-self,
  title     = {{Self-SoftCoT}: A Self-Consistent Framework via Position-Aware Latent Space Reinforcement Learning},
  author    = {Dong, Liangliang and
               Shan, Lianlei and
               Li, Shuaimin},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1496/},
  pages     = {32393--32414},
  isbn      = {979-8-89176-390-6},
  abstract  = {While Chain-of-Thought (CoT) reasoning empowers Large Language Models (LLMs) to tackle complex tasks, its reliance on discrete token decoding imposes an inherent Discreteness Bottleneck, limiting expressiveness within a restricted vocabulary space. Existing continuous reasoning approaches, such as SoftCoT, mitigate this but typically rely on external auxiliary models, resulting in complex deployment and fractured inference pipelines. To address these challenges, we propose Self-SoftCoT, a self-contained framework that enables a frozen LLM to internally generate and consume latent thoughts without external assistants. By establishing a single-stream ``Thinking {\textrightarrow} Speaking'' closed-loop, we decouple latent planning from explicit generation. Furthermore, we adopt Group Sequence Policy Optimization (GSPO) to stabilize learning and employ Position-Aware Independent Projection to mitigate representation homogenization. Experimental results on five reasoning benchmarks demonstrate that our method significantly improves the reasoning performance of frozen LLMs. Specifically, our Qwen2.5-based model uses only N=2 soft tokens to outperform the SoftCoT baseline (N=4), improving the average accuracy from 75.06{\%} to 78.42{\%}. Similarly, LLaMA-3.1 performance increases from 70.52{\%} to 74.55{\%}.},
}
Markdown (Informal)
[Self-SoftCoT: A Self-Consistent Framework via Position-Aware Latent Space Reinforcement Learning](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1496/) (Dong et al., ACL 2026)
ACL