@inproceedings{wang-balloccu-2026-arqa,
    title     = {{ARQA}: A Benchmark for Grounded Table{--}Text {QA} in Enterprise Annual Reports},
    author    = {Wang, Ruilong and
                 Balloccu, Simone},
    editor    = {Matusevych, Yevgen and
                 Eryi{\u{g}}it, G{\"u}l{\c{s}}en and
                 Aletras, Nikolaos},
    booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 5: Industry Track)},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.eacl-industry.63/},
    pages     = {847--868},
    isbn      = {979-8-89176-384-5},
    abstract  = {Annual reports communicate corporate performance to stakeholders through dense tables and explanatory text, with rich grounding signals making automated reasoning challenging. Existing QA benchmarks focus on retrieval or single-modality reasoning and rarely require justification for answers with both textual and tabular evidence. We introduce ARQA (Annual Report QA), a benchmark of {\textasciitilde}2.5K QA pairs spanning ten fiscal years of automotive enterprise annual reports and three reasoning families {---} Lookup, Arithmetic, and Insight. Data are produced via a planner{--}generator pipeline, deterministically verified and recomputed, and fully reviewed by domain experts. We evaluate state-of-the-art instruction-tuned language models on ARQA, showing strong factual retrieval but persistent weaknesses in grounded arithmetic and causal reasoning. We release ARQA and its evaluation toolkit to facilitate research on auditable, evidence-first reasoning over enterprise documents. (https://github.com/RuilongWang/ARQA-Benchmark/)},
}
Markdown (Informal)
[ARQA: A Benchmark for Grounded Table–Text QA in Enterprise Annual Reports](https://aclanthology.org/2026.eacl-industry.63/) (Wang & Balloccu, EACL 2026)
ACL