@inproceedings{dong-etal-2025-relationalcoder,
  title     = {{RelationalCoder}: Rethinking Complex Tables via Programmatic Relational Transformation},
  author    = {Dong, Haoyu and
               Hu, Yue and
               Peng, Huailiang and
               Cao, Yanan},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.89/},
  pages     = {1771--1784},
  isbn      = {979-8-89176-251-0},
  abstract  = {Semi-structured tables, with their varied layouts and formatting artifacts, remain a major obstacle for automated data processing and analytics. To address these challenges, we propose RelationalCoder, which uniformly converts semi-structured tables into relational data, enabling smooth integration with the rich ecosystem of data processing and analytics tools. By leveraging SQL code, RelationalCoder prevents schema errors and markedly improves normalization quality across multiple relational tables. To address the challenge of large tables, we propose a new technique called Loop Reference Decoding (LRD): it identifies \textit{expandable groups}{---}repeating regions of similar structure and semantics{---}and replicates each group using a concise loop over its repetitive region by referencing cell addresses, rather than regenerating each individual cell. This design substantially reduces output length from $\mathcal{O}(N \times M)${---}proportional to the table{'}s height ($N$) and width ($M$){---}to approximately $\mathcal{O}(K)$, where $K$ is the total number of unique cell types within detected expandable groups. As a result, LRD is highly scalable: the larger the input table, the greater the compression ratio. It scales seamlessly to extremely large tables, achieving output reductions of up to $100{,}000\times$. We further create the first human-labeled corpus for table transformation, created with a cost-efficient, actively supervised pipeline. Extensive experiments on HiTab and MultiHiertt show that RelationalCoder not only enables programmatic symbolic reasoning but also boosts QA accuracy{---}raising Llama-2 and Mistral models by more than 20{\%}, and GPT-4o by over 4{\%}. Project page: https://github.com/haoyudong/RelationalCoder.},
}
Markdown (Informal)
[RelationalCoder: Rethinking Complex Tables via Programmatic Relational Transformation](https://aclanthology.org/2025.acl-long.89/) (Dong et al., ACL 2025)
ACL