% Findings of EMNLP 2025 paper, ACL Anthology ID 2025.findings-emnlp.1186.
% The url field holds the canonical Anthology address for that ID (the same ID
% that appears in the doi field), not a temporary preview-branch link.
@inproceedings{huang-etal-2025-noise,
  title     = {From Noise to Clarity: Filtering Real and {LLM}-Generated Samples for Enhanced Intent Detection},
  author    = {Huang, Junbao and
               Li, Weizhen and
               Huang, Peijie and
               Xu, Yuhong},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1186/},
  doi       = {10.18653/v1/2025.findings-emnlp.1186},
  pages     = {21736--21746},
  isbn      = {979-8-89176-335-7},
  abstract  = {In dialogue intent detection, the challenge of acquiring sufficient corpora and the high cost of manual annotation often lead to incorrectly labeled or unrepresentative samples, which can hinder the generalization ability of classification models. Additionally, as using large language models for generating synthetic samples for data augmentation becomes more common, these synthetic samples may exacerbate the problem by introducing additional noise due to the models' limited prior knowledge. To address this challenge, this paper proposes an interpretable Sample Filter by Topic Modeling (SFTM) framework. By evaluating the diversity and authenticity of the samples, SFTM effectively reduces the quantity of real and synthetic samples while improving the performance of the classification models. Our codes are publicly available at https://github.com/gumbouh/SFTM.},
}
Markdown (Informal)
[From Noise to Clarity: Filtering Real and LLM-Generated Samples for Enhanced Intent Detection](https://aclanthology.org/2025.findings-emnlp.1186/) (Huang et al., Findings 2025)
ACL