@inproceedings{azeemi-etal-2026-language,
title = "Language Model-Driven Data Pruning Enables Efficient Active Learning",
author = "Azeemi, Abdul Hameed and
Qazi, Ihsan Ayyub and
Raza, Agha Ali",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.229/",
pages = "4373--4392",
ISBN = "979-8-89176-386-9",
abstract = "Active learning (AL) optimizes data labeling efficiency by selecting the most informative instances for annotation. However, scaling active learning to large datasets remains a critical challenge, as AL acquisition functions incur prohibitive computational costs when evaluating large unlabeled data pools. To bridge this gap, we introduce a novel plug-and-play data pruning strategy, ActivePrune, which leverages language models to prune the unlabeled pool. ActivePrune implements a two-stage pruning process: an initial fast evaluation using perplexity scores from an n-gram language model, followed by a high-quality selection using metrics for data quality computed through a quantized LLM. To enhance the diversity of the unlabeled pool, we propose a novel perplexity reweighting method that systematically brings forward underrepresented instances for selection. Experiments on translation, sentiment analysis, topic classification, and summarization tasks on diverse datasets and AL strategies demonstrate that ActivePrune outperforms existing data pruning methods. Finally, we compare the selection quality $\leftrightarrow$ efficiency tradeoff of the data pruning methods and show that ActivePrune provides up to 74{\%} reduction in the end-to-end AL time compared to other LLM score-based pruning methods."
}