@inproceedings{yang-etal-2025-doubling,
  title     = {Doubling Your Data in Minutes: Ultra-fast Tabular Data Generation via {LLM}-Induced Dependency Graphs},
  author    = {Yang, Shuo and
               Zhang, Zheyu and
               Prenkaj, Bardh and
               Kasneci, Gjergji},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.525/},
  pages     = {10348--10369},
  isbn      = {979-8-89176-332-6},
  abstract  = {Tabular data is critical across diverse domains, yet high-quality datasets remain scarce due to privacy concerns and the cost of collection. Contemporary approaches adopt large language models (LLMs) for tabular augmentation, but exhibit two major limitations: (1) dense dependency modeling among tabular features that can introduce bias, and (2) high computational overhead in sampling. To address these issues, we propose SPADA for SPArse Dependency-driven Augmentation, a lightweight generative framework that explicitly captures sparse dependencies via an LLM-induced graph. We treat each feature as a node and synthesize values by traversing the graph, conditioning each feature solely on its parent nodes. We explore two synthesis strategies: a non-parametric method using Gaussian kernel density estimation, and a conditional normalizing flow model that learns invertible mappings for conditional density estimation. Experiments on four datasets show that SPADA reduces constraint violations by 4{\%} compared to diffusion-based methods and accelerates generation by nearly 9,500{\texttimes} over LLM-based baselines.},
}
Markdown (Informal)
@comment{Informal Markdown citation (scraped alongside the entry above):
[Doubling Your Data in Minutes: Ultra-fast Tabular Data Generation via LLM-Induced Dependency Graphs](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.525/) (Yang et al., EMNLP 2025)
ACL
}