% ACL Anthology record (paper ID 2025.ijcnlp-long.57) for Lee, Rush and Vafa, IJCNLP-AACL 2025.
% The url field uses the canonical Anthology address, not the temporary preview/ingest host.
@inproceedings{lee-etal-2025-critical,
  title     = {Critical Thinking: Which Kinds of Complexity Govern Optimal Reasoning Length?},
  author    = {Lee, Celine and
               Rush, Alexander M and
               Vafa, Keyon},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.ijcnlp-long.57/},
  pages     = {1038--1060},
  isbn      = {979-8-89176-298-5},
  abstract  = {Large language models (LLMs) often benefit from verbalized reasoning at inference time, but it remains unclear which aspects of task difficulty these extra reasoning tokens address. To investigate this question, we construct a controlled setting where task complexity can be precisely manipulated to study its effect on reasoning length. Deterministic finite automata (DFAs) offer a formalism through which we can characterize task complexity through measurable properties such as run length (number of reasoning steps required) and state-space size (decision complexity). We first show that across different tasks and models of different sizes and training paradigms, there exists an optimal amount of reasoning tokens such that the probability of producing a correct solution is maximized. We then investigate which properties of complexity govern this critical length: we find that task instances with longer corresponding underlying DFA runs (i.e. demand greater latent state-tracking requirements) correlate with longer reasoning lengths, but, surprisingly, that DFA size (i.e. state-space complexity) does not. We then demonstrate an implication of these findings: being able to predict the optimal number of reasoning tokens for new problems and filtering out non-optimal length answers results in consistent accuracy improvements.},
}
Markdown (Informal)
[Critical Thinking: Which Kinds of Complexity Govern Optimal Reasoning Length?](https://aclanthology.org/2025.ijcnlp-long.57/) (Lee et al., IJCNLP-AACL 2025)
ACL
- Celine Lee, Alexander M Rush, and Keyon Vafa. 2025. Critical Thinking: Which Kinds of Complexity Govern Optimal Reasoning Length? In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1038–1060, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.