@inproceedings{singh-etal-2025-mtabvqa,
title = "{MT}ab{VQA}: Evaluating Multi-Tabular Reasoning of Language Models in Visual Space",
author = "Singh, Anshul and
Biemann, Chris and
Strich, Jan",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1083/",
doi = "10.18653/v1/2025.findings-emnlp.1083",
pages = "19866--19891",
ISBN = "979-8-89176-335-7",
abstract = "Vision-Language Models (VLMs) have demonstrated remarkable capabilities in interpreting visual layouts and text. However, a significant challenge remains in their ability to interpret robustly and reason over multi-tabular data presented as images, a common occurrence in real-world scenarios like web pages and digital documents. Existing benchmarks typically address single tables or non-visual data (text/structured). This leaves a critical gap: they don{'}t assess the ability to parse diverse table images, correlate information across them, and perform multi-hop reasoning on the combined visual data. To bridge this evaluation gap, we introduce MTabVQA, a novel benchmark specifically designed for multi-tabular visual question answering. MTabVQA comprises 3,745 complex question-answer pairs that necessitate multi-hop reasoning across several visually rendered table images. We provide extensive benchmark results for state-of-the-art VLMs on MTabVQA, revealing significant performance limitations. We further investigate post-training techniques to enhance these reasoning abilities and release MTabVQA-Instruct, a large-scale instruction-tuning dataset. Our experiments show that fine-tuning VLMs with MTabVQA-Instruct substantially improves their performance on visual multi-tabular reasoning. Code and dataset are available online: ."
}