% Conference paper in the ACL 2026 proceedings (Volume 1: Long Papers).
% NOTE(review): the url field targets the ACL Anthology preview/ingest host;
% presumably it should be replaced by the canonical Anthology URL once the
% volume is published -- confirm before citing.
% NOTE(review): address appears to hold the conference venue rather than the
% publisher's city (a common ACL Anthology export convention) -- confirm
% against the target bibliography style.
@inproceedings{kim-etal-2026-efficiently,
  author    = {Kim, Taehyeon and
               Lee, Hyunsoo and
               Jang, Youngsoo and
               Lee, Moontae},
  title     = {Efficiently Learning To Reason or Not to Reason: Root-token Policy Optimization for Adaptive Thinking},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  year      = {2026},
  month     = jul,
  pages     = {17934--17949},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  isbn      = {979-8-89176-390-6},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.816/},
  abstract  = {Large reasoning models (LRMs) achieve strong performance by externalizing explicit reasoning traces before producing the answer, yet suffer from overthinking challenge that allocates uniformly heavy computation to queries of varying difficulty. While proprietary models mitigate this via opaque routing, open-source LRMs still lack an efficient mechanism to internalize adaptive reasoning due to both expensive training cost and limited disclosure of training recipes. In response, we introduce RPO (Root-token Policy Optimization), a framework that enables LRMs to self-determine when to reason by training only the initial root token (e.g., whether to invoke the think tag) via group relative reward and group-wise advantages. By focusing on this pivotal branching point, RPO drastically reduces training overhead and VRAM usage. Across multiple model families and scales, RPO learns difficulty-aware adaptive thinking at just 2{\%} of the training compute of prior adaptive-reasoning methods.},
}

Markdown (Informal)
[Efficiently Learning To Reason or Not to Reason: Root-token Policy Optimization for Adaptive Thinking](https://preview.aclanthology.org/ingest-acl/2026.acl-long.816/) (Kim et al., ACL 2026)
ACL