@inproceedings{wolff-hulsebos-2025-well,
title = "How well do {LLM}s reason over tabular data, really?",
author = "Wolff, Cornelius and
Hulsebos, Madelon",
editor = "Chang, Shuaichen and
Hulsebos, Madelon and
Liu, Qian and
Chen, Wenhu and
Sun, Huan",
booktitle = "Proceedings of the 4th Table Representation Learning Workshop",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/acl25-workshop-ingestion/2025.trl-workshop.21/",
pages = "241--250",
ISBN = "979-8-89176-268-8",
abstract = "Large Language Models (LLMs) excel in natural language tasks, but less is known about their reasoning capabilities over tabular data. Prior analyses devise evaluation strategies that poorly reflect an LLM{'}s realistic performance on tabular queries. Moreover, we have a limited understanding of the robustness of LLMs towards realistic variations in tabular inputs. Therefore, we ask: Can general-purpose LLMs reason over tabular data, really?, and focus on two questions 1) are tabular reasoning capabilities of general-purpose LLMs robust to real-world characteristics of tabular inputs, and 2) how can we realistically evaluate an LLM{'}s performance on analytical tabular queries?Building on a recent tabular reasoning benchmark, we first surface shortcomings of its multiple-choice prompt evaluation strategy, as well as commonly used free-form text metrics such as SacreBleu and BERT-score. We show that an LLM-as-a-judge procedure yields more reliable performance insights and unveil a significant deficit in tabular reasoning performance of LLMs. We then extend the tabular inputs reflecting three common characteristics in practice: 1) missing values, 2) duplicate entities, and 3) structural variations. Experiments show that the tabular reasoning capabilities of general-purpose LLMs suffer from these variations, stressing the importance of improving their robustness for realistic tabular inputs."
}
Markdown (Informal)
[How well do LLMs reason over tabular data, really?](https://preview.aclanthology.org/acl25-workshop-ingestion/2025.trl-workshop.21/) (Wolff & Hulsebos, TRL 2025)