% CoNLL 2026 paper by Petersen, Purushothama, and Schneider
% (Anthology identifier 2026.conll-main.4, taken from the url field).
% NOTE(review): the url field points at a preview.aclanthology.org ingestion
% branch; presumably it should be swapped for the canonical Anthology URL once
% the volume is published -- confirm before relying on it.
% The quoted word in the title is brace-protected so that sentence-casing
% styles do not lowercase the first word after the colon.
@inproceedings{petersen-etal-2026-sense,
  title     = {Sense and Sensitivity: {``Reasoning''} Models are More Robust, but can Diverge from Human Consensus in a Legal Interpretation Task},
  author    = {Petersen, Dawson and
               Purushothama, Abhishek and
               Schneider, Nathan},
  editor    = {Bonial, Claire and
               Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.4/},
  pages     = {45--56},
  isbn      = {979-8-89176-410-1},
  abstract  = {Can LLMs make metalinguistic judgments? While LLM embeddings are often regarded as high-quality semantic representations, it is not clear that prompting an LLM is a useful way to obtain metalinguistic insights (e.g., whether a DIY gun kit is a ``firearm''). While some prior work has suggested LLM prompting can simulate surveys with human participants, computational studies in the domain of legal interpretation have found that LLMs are unreliable for metalinguistic judgments due to prompt sensitivity. However, these studies did not directly compare humans and LLMs on identical tasks, nor did they test so-called ``reasoning'' models. The current study addresses these gaps by directly comparing the robustness of human and LLM judgments (with and without \emph{reasoning}) in an English-language legal interpretation task. Our results show that LLMs were more sensitive to irrelevant prompt features compared to human participants. Enabling \emph{reasoning} improved the stability of LLM responses. However, even \emph{reasoning} model outputs had only moderate correlations with human judgments, and all models sometimes output interpretations that no humans reached in response to the same prompt. We conclude that while \emph{reasoning} decreases prompt sensitivity, LLMs are still poor proxies for human metalinguistic judgments.},
}

Markdown (Informal)
[Sense and Sensitivity: “Reasoning” Models are More Robust, but can Diverge from Human Consensus in a Legal Interpretation Task](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.4/) (Petersen et al., CoNLL 2026)
ACL