@comment{Conference paper record exported from the ACL Anthology.
  The url field uses the canonical aclanthology.org address built from the
  Anthology ID 2025.ijcnlp-long.78 instead of the temporary ingestion preview host.}
@inproceedings{sanz-guerrero-von-der-wense-2025-mitigating,
  author    = {Sanz-Guerrero, Mario and
               von der Wense, Katharina},
  title     = {Mitigating Label Length Bias in Large Language Models},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  pages     = {1404--1420},
  isbn      = {979-8-89176-298-5},
  url       = {https://aclanthology.org/2025.ijcnlp-long.78/},
  abstract  = {Large language models (LLMs) are powerful zero- and few-shot learners. However, when predicting over a set of candidate options, LLMs suffer from label biases, and existing calibration methods overlook biases arising from multi-token class labels. We tackle an issue we call *label length bias*, where labels of different lengths are treated inconsistently, even after standard length normalization. To mitigate it, we propose *normalized contextual calibration* (NCC), an effective method that normalizes and calibrates predictions at the full-label level. NCC achieves statistically significant improvements over prior approaches across multiple datasets and models, with gains of up to 10{\%} F1. Moreover, NCC extends bias mitigation to broader tasks such as multiple-choice question answering. Our analysis shows that, when combined with in-context learning, NCC is less sensitive to few-shot example selection, requires fewer examples for competitive performance, and produces more reliable confidence estimates. These findings highlight the importance of mitigating full-label biases to improve the performance and robustness of LLM-based methods, particularly in real-world applications where class labels naturally consist of multiple tokens.},
}

Markdown (Informal)
[Mitigating Label Length Bias in Large Language Models](https://aclanthology.org/2025.ijcnlp-long.78/) (Sanz-Guerrero & von der Wense, IJCNLP-AACL 2025)
ACL
- Mario Sanz-Guerrero and Katharina von der Wense. 2025. Mitigating Label Length Bias in Large Language Models. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1404–1420, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.