% ACL Anthology record 2025.emnlp-main.357 (EMNLP 2025 proceedings paper).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{xia-etal-2025-realistic,
    title     = {Realistic Training Data Generation and Rule Enhanced Decoding in {LLM} for {NameGuess}},
    author    = {Xia, Yikuan and
                 Chen, Jiazun and
                 Li, Sujian and
                 Gao, Jun},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.emnlp-main.357/},
    pages     = {7001--7018},
    isbn      = {979-8-89176-332-6},
    abstract  = {The wide use of abbreviated column names (derived from English words or Chinese Pinyin) in database tables poses significant challenges for table-centric tasks in natural language processing and database management. Such a column name expansion task, referred to as the NameGuess task, has previously been addressed by fine-tuning Large Language Models (LLMs) on synthetically generated rule-based data. However, the current approaches yield suboptimal performance due to two fundamental limitations: 1) the rule-generated abbreviation data fails to reflect real-world distribution, and 2) the failure of LLMs to follow the rule-sensitive patterns in NameGuess persistently. For the data realism issue, we propose a novel approach that integrates a subsequence abbreviation generator trained on human-annotated data and collects non-subsequence abbreviations to improve the training set. For the rule violation issue, we propose a decoding system constrained on an automaton that represents the rules of abbreviation expansion. We extended the original English NameGuess test set to include non-subsequence and PinYin scenarios. Experimental results show that properly tuned 7/8B moderate-size LLMs with a refined decoding system can surpass the few-shot performance of state-of-the-art LLMs, such as the GPT-4 series. The code and data are presented in the supplementary material.},
}
Markdown (Informal)
[Realistic Training Data Generation and Rule Enhanced Decoding in LLM for NameGuess](https://aclanthology.org/2025.emnlp-main.357/) (Xia et al., EMNLP 2025)
ACL