% SchemaScope workshop paper (SURGeLLM 2026), in ACL Anthology export layout.
% - url uses the canonical aclanthology.org form of the Anthology ID rather
%   than the temporary preview/ingestion host.
% - Case-sensitive names are brace-protected as whole words so sentence-casing
%   styles keep them intact.
% - NOTE(review): address presumably records the workshop venue (Anthology
%   convention), not the publisher's city; confirm before normalising.
@inproceedings{bukkapatnam-malik-2026-schemascope,
  title     = {{SchemaScope}: How Join-Hop Depth Breaks Text-to-{SQL} in Large Language Models, and a Decomposition-Based Remedy},
  author    = {Bukkapatnam, Kaustubh S. and
               Malik, Rayan},
  editor    = {Gupta, Vivek and
               Ding, Kaize and
               Kokel, Harsha and
               Zhao, Yue and
               Agarwal, Amit and
               Wang, Yu and
               Glass, Michael and
               Zhang, Yu and
               Srinivas, Kavitha and
               Chen, Xiusi and
               Hassanzadeh, Oktie and
               Zhu, Qi and
               Chang, Shuaichen and
               Luo, Yuan},
  booktitle = {Proceedings of the First Workshop on Structured Understanding, Retrieval, and Generation in the {LLM} Era ({SURGeLLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.surgellm-1.17/},
  pages     = {269--274},
  isbn      = {979-8-89176-406-4},
  abstract  = {Large language models (LLMs) achieve impressive accuracy on standard Text-to-SQL benchmarks such as Spider and BIRD, yet enterprise databases, with hundreds of tables and complex foreign key graphs, remain a practical bottleneck. We hypothesize that a single, measurable property drives most of this gap: the join-hop depth ($h$) of the query, defined as the number of foreign key edges that must be traversed to gather all required columns. We introduce the Join-Hop Depth (JHD) benchmark, 410 human-annotated questions stratified by $h \in \{1, \ldots, 6\}$ over 12 enterprise-scale schemas. Experiments on five frontier LLMs confirm a sharp accuracy cliff: all models exceed 80{\%} at $h = 1$ but fall below 40{\%} at $h = 4$ and below 25{\%} at $h = 6$, the typical depth of real enterprise analytics queries. To address this, we propose SchemaScope, a decomposition framework that partitions deep queries into a sequence of sub-queries with $h \leq 2$, executes them independently, and merges the results. SchemaScope raises execution accuracy from 46.8{\%} to 67.3{\%} on JHD (GPT-4o, $h \geq 3$) and improves execution accuracy by $+9.3$ percentage points on the BIRD development set. Error analysis shows that decomposition eliminates \textit{wrong join path} errors, the dominant failure mode at high $h$, and shifts the residual error budget toward condition and aggregation mistakes that are amenable to existing post-processing methods.},
}