% TabFaith workshop paper (ACL Anthology paper ID 2026.surgellm-1.21),
% published in the SURGeLLM 2026 workshop proceedings.
% Names that must survive style recasing are brace-protected as whole words.
% `address` holds the value as exported (the event location), not a publisher city.
@inproceedings{bukkapatnam-mehta-2026-tabfaith,
  title     = {{TabFaith}: Benchmarking and Improving Structural Faithfulness in {LLM} Table Summarization},
  author    = {Bukkapatnam, Kaustubh S. and
               Mehta, Sohum},
  editor    = {Gupta, Vivek and
               Ding, Kaize and
               Kokel, Harsha and
               Zhao, Yue and
               Agarwal, Amit and
               Wang, Yu and
               Glass, Michael and
               Zhang, Yu and
               Srinivas, Kavitha and
               Chen, Xiusi and
               Hassanzadeh, Oktie and
               Zhu, Qi and
               Chang, Shuaichen and
               Luo, Yuan},
  booktitle = {Proceedings of the First Workshop on Structured Understanding, Retrieval, and Generation in the {LLM} Era ({SURGeLLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.surgellm-1.21/},
  pages     = {326--332},
  isbn      = {979-8-89176-406-4},
  abstract  = {When large language models (LLMs) summarize tabular data, they produce fluent but systematically unfaithful text{---}hallucinating numerical values, misattributing entities to rows or columns, fabricating comparative rankings, and conflating temporal references. Existing faithfulness metrics (BLEU, PARENT, BERTScore) are poorly correlated with human judgments of structural faithfulness (r {\ensuremath{\leq}} 0.60) because they are agnostic to the table{'}s schema and cell structure. We introduce TABFAITH, a benchmark of 2,400 (table, summary, error annotation) triples across five structural error types, built from ToTTo and a new enterprise table summarization dataset (TabSum-Ent) covering financial reports, clinical notes, and operational dashboards. We further propose STAF (Structural Table-Aware Faithfulness), a reference-free metric that decomposes faithfulness verification into cell-level claim alignment using natural language inference over table cells. STAF achieves r = 0.94 with human faithfulness judgments{---}a +0.34 improvement over PARENT (r = 0.60) and +0.70 over BLEU (r = 0.24). Guided by STAF{'}s fine-grained signal, we design CAVE (Cell-Anchored Verification and Editing), a training-free post-processing method that identifies unfaithful claims, traces them to specific table cells, and re-generates the offending spans. CAVE improves STAF scores by +0.14 on average across five LLMs on both ToTTo and TabSum-Ent, with the largest gains for numerical errors (+0.17){---}the dominant error type for smaller models.},
}