@comment{
  Findings of ACL 2026 paper by Gubelmann and Hongler (ACL Anthology export).
  Acronyms and proper nouns are brace-protected as whole words so that
  sentence-casing styles keep their capitalisation.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  build; confirm the permanent address once the volume is published.
  NOTE(review): address looks like the conference venue rather than the
  publisher's city; kept as exported, confirm against the target style.
}
@inproceedings{gubelmann-hongler-2026-fast,
  title     = {Too Fast, Too Shallow {--} {LLMs}, Including Reasoning {LLMs}, Are Unreliable Constitutional Reasoners},
  author    = {Gubelmann, Reto and Hongler, Peter},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2016/},
  pages     = {40554--40572},
  isbn      = {979-8-89176-395-1},
  abstract  = {We assess LLMs' constitutional reasoning abilities using three different, newly developed datasets on three different constitutional questions in three different constitutional frameworks, comprising two different languages; the structure and content of the datasets is informed by legal expertise and grounded in the state of the art in philosophy of language. Our results indicate that the 19 LLMs tested, including the reasoning LLMs, while not being uniformly subject to political bias, are still not reliable constitutional reasoners, as they are heavily influenced by logically irrelevant aspects of the reasoning. Of the 196k evaluations run in our main experiment, the LLMs label less than 70{\%} correctly, and open-weight reasoning LLMs as well as gpt-4o are outperformed by moderately sized open-weight non-reasoning LLMs. None of the LLMs tested consistently show slow, systematic, rule-based system 2 thinking.},
}
Markdown (Informal)
[Too Fast, Too Shallow – LLMs, Including Reasoning LLMs, Are Unreliable Constitutional Reasoners](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2016/) (Gubelmann & Hongler, Findings 2026)
ACL