% Findings of EMNLP 2025 paper; ACL Anthology ID 2025.findings-emnlp.1031 (matches the DOI suffix).
@inproceedings{safarzadeh-etal-2025-evaluating,
    title     = {Evaluating {NL2SQL} via {SQL2NL}},
    author    = {Safarzadeh, Mohammadtaher and
                 Oroojlooy, Afshin and
                 Roth, Dan},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-emnlp.1031/},
    doi       = {10.18653/v1/2025.findings-emnlp.1031},
    pages     = {18954--18968},
    isbn      = {979-8-89176-335-7},
    abstract  = {Robust evaluation in the presence of linguistic variation is key to understanding the generalization capabilities of Natural Language to SQL (NL2SQL) models, yet existing benchmarks rarely address this factor in a systematic or controlled manner. We propose a novel schema-aligned paraphrasing framework that leverages SQL-to-NL (SQL2NL) to automatically generate semantically equivalent, lexically diverse queries while maintaining alignment with the original schema and intent. This enables the first targeted evaluation of NL2SQL robustness to linguistic variation in isolation---distinct from prior work that primarily investigates ambiguity or schema perturbations. Our analysis reveals that state-of-the-art models are far more brittle than standard benchmarks suggest. For example, LLaMa3.3-70B exhibits a 10.23{\%} drop in execution accuracy (from 77.11{\%} to 66.9{\%}) on paraphrased Spider queries, while LLaMa3.1-8B suffers an even larger drop of nearly 20{\%} (from 62.9{\%} to 42.5{\%}). Smaller models (e.g., GPT-4o mini) are disproportionately affected. We also find that robustness degradation varies significantly with query complexity, dataset, and domain---highlighting the need for evaluation frameworks that explicitly measure linguistic generalization to ensure reliable performance in real-world settings.},
}
Markdown (Informal)
[Evaluating NL2SQL via SQL2NL](https://aclanthology.org/2025.findings-emnlp.1031/) (Safarzadeh et al., Findings 2025)
ACL
- Mohammadtaher Safarzadeh, Afshin Oroojlooy, and Dan Roth. 2025. Evaluating NL2SQL via SQL2NL. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 18954–18968, Suzhou, China. Association for Computational Linguistics.