% Conference-paper record for "Learning on Imbalanced Noisy Data via Debiased
% Sample Selection and LLM-Driven Annotation" (Yuan, Chen, Zhang), published in
% the Findings of ACL 2026 volume, pages 30504--30542.
%
% Conventions used in this entry:
%   - Values are brace-delimited; `month` uses the bare three-letter macro.
%   - Acronyms and proper nouns are protected with whole-word braces so that
%     sentence-casing styles keep their capitalisation.
%   - `abstract` is reproduced verbatim from the published record; most styles
%     ignore this field.
%
% NOTE(review): `address` presumably records the conference location rather
% than the publisher's city -- confirm this is what the target style expects.
% NOTE(review): `url` points at a preview/ingest host; confirm whether a
% canonical Anthology URL (or a DOI) is available before relying on it.
@inproceedings{yuan-etal-2026-learning,
  title     = {Learning on Imbalanced Noisy Data via Debiased Sample Selection and {LLM}-Driven Annotation},
  author    = {Yuan, Bo and Chen, Yulin and Zhang, Yin},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1526/},
  pages     = {30504--30542},
  isbn      = {979-8-89176-395-1},
  abstract  = {Learning with Noisy Labels (LNL) is a challenge where the collected training set can contain incorrect or corrupted labels. Most existing solutions distinguish clean samples from noisy samples and query human experts on noisy samples for denoising. However, these solutions often operate under the unrealistic assumption that the distribution of classes is uniform, overlooking the skewed and imbalanced distributions frequently encountered in real-world scenarios. In this case, we empirically reveal that previous solutions suffer from both selection bias and training bias, leading to distinguish clean samples from noisy samples hardly. In this paper, our work introduces the imbalanced learning with noisy labels (i-LNL) task, which seeks to let the model learn from noisy labels within imbalanced distributions. A new benchmark (ImbaLNL-Bench) comprised of some synthetic and real-world datasets is created to provide a thorough representation of practical use cases. Besides, we propose an innovative collaborative learning framework DeCo for i-LNL tasks. Specifically, we first conduct debiased sample selection, consisting of a robust expert model and a debiased-enhanced threshold strategy, to better separate clean samples from noisy samples, especially for the tail classes. Then we feed selected clean samples to active annotator large language models (LLMs) for re-annotating noisy samples using in-context learning, which can better reduce human effort. Ultimately, we employ distinct loss functions adept at managing subsets with varying degrees of label noise. Extensive experimental results on synthetic and real-world datasets show the effectiveness and superiority of our method.},
}
Markdown (Informal)
[Learning on Imbalanced Noisy Data via Debiased Sample Selection and LLM-Driven Annotation](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1526/) (Yuan et al., Findings 2026)
ACL