@inproceedings{shelat-etal-2026-beyond,
title = "Beyond Memorization: Testing {LLM} Reasoning on Unseen Theory of Computation Tasks",
author = "Shelat, Shlok and
Raval, Jay and
Roy, Souvik and
Gaur, Manas",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1495/",
pages = "29904--29934",
ISBN = "979-8-89176-395-1",
abstract = "Large language models (LLMs) have demonstrated strong performance on formal language tasks, yet whether this reflects genuine symbolic reasoning or pattern matching on familiar constructions remains unclear. We introduce a benchmark for deterministic finite automata (DFA) construction from regular languages, comprising factual knowledge questions, seen construction problems from public sources, and two types of unseen problems: hand-crafted instances with multiple interacting constraints and systematically generated problems via Arden{'}s theorem. Models achieve perfect accuracy on factual questions and 84-90{\%} on seen tasks. However, accuracy drops sharply on unseen problems (by 30-64{\%}), with failures stemming from systematic misinterpretation of language constraints, incorrect handling of Kleene-star semantics, and a failure to preserve global consistency. We evaluate a three-stage hint protocol that enables correction of shallow errors but does not reliably resolve globally inconsistent or structurally flawed automata. Our analysis across multiple prompting strategies (direct, Chain-of-Thought, Tree-of-Thought) reveals that errors persist regardless of prompting approach, exposing a fundamental gap between LLMs' ability to generate syntactically plausible DFAs and their capacity for semantically correct formal reasoning."
}Markdown (Informal)
[Beyond Memorization: Testing LLM Reasoning on Unseen Theory of Computation Tasks](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1495/) (Shelat et al., Findings 2026)
ACL