@inproceedings{cegin-etal-2025-llms,
title = "{LLM}s vs Established Text Augmentation Techniques for Classification: When do the Benefits Outweight the Costs?",
author = "Cegin, Jan and
Simko, Jakub and
Brusilovsky, Peter",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.naacl-long.526/",
pages = "10476--10496",
ISBN = "979-8-89176-189-6",
abstract = "The generative large language models (LLMs) are increasingly being used for data augmentation tasks, where text samples are LLM-paraphrased and then used for classifier fine-tuning. Previous studies have compared LLM-based augmentations with established augmentation techniques, but the results are contradictory: some report superiority of LLM-based augmentations, while other only marginal increases (and even decreases) in performance of downstream classifiers. A research that would confirm a clear cost-benefit advantage of LLMs over more established augmentation methods is largely missing. To study if (and when) is the LLM-based augmentation advantageous, we compared the effects of recent LLM augmentation methods with established ones on 6 datasets, 3 classifiers and 2 fine-tuning methods. We also varied the number of seeds and collected samples to better explore the downstream model accuracy space. Finally, we performed a cost-benefit analysis and show that LLM-based methods are worthy of deployment only when very small number of seeds is used. Moreover, in many cases, established methods lead to similar or better model accuracies."
}
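
The abstract describes the general setup studied in the paper: seed text samples are paraphrased by an LLM and the paraphrases are added to the training set before fine-tuning a downstream classifier. The following is a minimal sketch of that setup, not the authors' code: `paraphrase_with_llm` is a hypothetical placeholder for a real LLM call, and a scikit-learn pipeline stands in for the fine-tuned classifiers evaluated in the paper.

```python
# Minimal sketch (assumptions noted below), not the authors' implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


def paraphrase_with_llm(text: str, n: int = 3) -> list[str]:
    """Hypothetical placeholder for an LLM paraphrasing call.

    In practice this would prompt a generative LLM to rewrite `text`
    while preserving its label; here it only echoes the input so the
    sketch stays runnable without API access.
    """
    return [text] * n


# Small seed set of (text, label) pairs, as in low-resource augmentation setups.
seeds = [
    ("the battery dies within an hour", 0),
    ("great screen and snappy performance", 1),
]

# Augment: each seed contributes its paraphrases under the same label.
augmented = [(p, y) for x, y in seeds for p in paraphrase_with_llm(x)]
texts, labels = zip(*(seeds + augmented))

# Train a downstream classifier on seeds plus augmented samples
# (stand-in for transformer fine-tuning).
clf = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
clf.fit(texts, labels)
print(clf.predict(["the screen looks fantastic"]))
```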