@inproceedings{shen-etal-2025-condenselm,
title = "{C}ondense{LM}: {LLM}s-driven Text Dataset Condensation via Reward Matching",
author = "Shen, Cheng and
Ong, Yew-Soon and
Zhou, Joey Tianyi",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.65/",
pages = "1237--1252",
ISBN = "979-8-89176-332-6",
abstract = "Dataset condensation has emerged as a promising technique to improve data efficiency under limited data budgets. However, when applied to the text level, existing methods struggle to compress more information into samples through optimization. Thus, these methods provide no obvious advantage over simpler coreset selection despite their high computational cost. In this paper, we introduce CondenseLM, a novel paradigm for both effective and efficient text-level dataset condensation. Our framework employs an LLMs-driven approach to sidestep the inherent limitations of existing methods, successfully generating more informative and less biased samples. In addition, it incorporates reward matching to align the LLMs-condensed dataset with the original dataset, maximizing representability and coverage. We conducted extensive experiments on SST-2, MNLI, AG News, and IMDB. Our approach outperforms both coreset selection and existing dataset condensation methods by large margins while also substantially reducing the computational cost."
}

Markdown (Informal)
[CondenseLM: LLMs-driven Text Dataset Condensation via Reward Matching](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.65/) (Shen et al., EMNLP 2025)