% Gemechu et al., Findings of ACL 2025 (pp. 3717--3741).
% The url field uses the canonical ACL Anthology address, not a preview/staging build.
@inproceedings{gemechu-etal-2025-natural,
  title     = {Natural Language Reasoning in Large Language Models: Analysis and Evaluation},
  author    = {Gemechu, Debela and
               Ruiz-Dolz, Ramon and
               Beyer, Henrike and
               Reed, Chris},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.192/},
  pages     = {3717--3741},
  isbn      = {979-8-89176-256-5},
  abstract  = {While Large Language Models (LLMs) have demonstrated promising results on a range of reasoning benchmarks{---}particularly in formal logic, mathematical tasks, and Chain-of-Thought prompting{---}less is known about their capabilities in unconstrained natural language reasoning. Argumentative reasoning, a form of reasoning naturally expressed in language and central to everyday discourse, presents unique challenges for LLMs due to its reliance on context, implicit assumptions, and value judgments. This paper addresses a gap in the study of reasoning in LLMs by presenting the first large-scale evaluation of their unconstrained natural language reasoning capabilities based on natural language argumentation. The paper offers three contributions: (i) the formalisation of a new strategy designed to evaluate argumentative reasoning in LLMs: argument-component selection; (ii) the creation of the Argument Reasoning Tasks (ART) dataset, a new benchmark for argument-component selection based on argument structures for natural language reasoning; and (iii) an extensive experimental analysis involving four different models, demonstrating the limitations of LLMs on natural language reasoning tasks.},
}
Markdown (Informal)
[Natural Language Reasoning in Large Language Models: Analysis and Evaluation](https://aclanthology.org/2025.findings-acl.192/) (Gemechu et al., Findings 2025)
ACL