@comment{
  Findings of ACL 2026 paper introducing the INDOTABVQA benchmark
  (cross-lingual table VQA on Bahasa Indonesia document images).
  The address field holds the conference venue, as exported by the ACL Anthology.
  NOTE(review): url points at the Anthology preview/ingest host; confirm and
  switch to the canonical Anthology URL (and add a doi) once it is known.
}
@inproceedings{gautam-etal-2026-indotabvqa,
  title     = {{INDOTABVQA}: A Benchmark for Cross-Lingual Table Understanding in {Bahasa} {Indonesia} Documents},
  author    = {Gautam, Somraj and
               Dravichi, Anathapindika and
               Harit, Gaurav},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1105/},
  pages     = {21969--21981},
  isbn      = {979-8-89176-395-1},
  abstract  = {We introduce INDOTABVQA, a benchmark for evaluating cross-lingual Table Visual Question Answering (VQA) on real-world document images in Bahasa Indonesia. The dataset comprises 1,593 document images across three visual styles (bordered, borderless, and colorful) with one or more tables, and 1,593 question-answer sets in four languages: Bahasa Indonesia, English, Hindi, and Arabic. This enables evaluation of Vision-Language Models (VLMs) in both monolingual (Bahasa documents with Bahasa questions) and cross-lingual settings (Bahasa documents with questions in other languages). We benchmark leading open-source VLMs (Qwen2.5-VL, Gemma-3, LLaMA-3.2) and GPT-4o and reveal substantial performance gaps, particularly on structurally complex tables and in low-resource languages. Fine-tuning a compact 3B model and a LoRA-finetuned 7B model on our dataset yields 11.6\% and 17.8\% improvements in accuracy. Providing explicit table region coordinates as additional input further improves performance by 4--7\%, demonstrating the value of spatial priors for table-based reasoning. Our findings underscore the importance of language-diverse, domain-specific datasets and demonstrate that targeted fine-tuning can significantly enhance VLM performance on specialized document understanding tasks. INDOTABVQA provides a valuable resource for advancing research in cross-lingual, structure-aware document understanding, especially in underrepresented regions of the world. The dataset is publicly available via Hugging Face at: https://huggingface.co/datasets/NusaBharat/INDOTABVQA.},
}
Markdown (Informal)
[INDOTABVQA: A Benchmark for Cross-Lingual Table Understanding in Bahasa Indonesia Documents](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1105/) (Gautam et al., Findings 2026)
ACL