@inproceedings{xu-etal-2025-distillation,
title = "Distillation versus Contrastive Learning: How to Train Your Rerankers",
author = "Xu, Zhichao and
Huang, Zhiqi and
Zhuang, Shengyao and
Srikumar, Vivek",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.33/",
pages = "564--578",
ISBN = "979-8-89176-303-6",
abstract = "Training effective text rerankers is crucial for information retrieval. Two strategies are widely used: contrastive learning (optimizing directly on ground-truth labels) and knowledge distillation (transferring knowledge from a larger reranker). While both have been studied extensively, a clear comparison of their effectiveness for training cross-encoder rerankers under practical conditions is needed.This paper empirically compares these strategies by training rerankers of different sizes (0.5B, 1.5B, 3B, 7B) and architectures (Transformer, Recurrent) using both methods on the same data, with a strong contrastive learning model acting as the distillation teacher. Our results show that knowledge distillation generally yields better in-domain and out-of-domain ranking performance than contrastive learning when distilling from a more performant teacher model. This finding is consistent across student model sizes and architectures. However, distilling from a teacher of the same capacity does not provide the same advantage, particularly for out-of-domain tasks. These findings offer practical guidance for choosing a training strategy based on available teacher models. We recommend using knowledge distillation to train smaller rerankers if a larger, more performant teacher is accessible; in its absence, contrastive learning remains a robust baseline. Our code implementation is made available to facilitate reproducbility."
}

Markdown (Informal)
[Distillation versus Contrastive Learning: How to Train Your Rerankers](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.33/) (Xu et al., Findings 2025)
ACL
- Zhichao Xu, Zhiqi Huang, Shengyao Zhuang, and Vivek Srikumar. 2025. Distillation versus Contrastive Learning: How to Train Your Rerankers. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 564–578, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.
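
To illustrate the two training strategies compared in the abstract, below is a minimal sketch (not the authors' released code) of the two objectives for a cross-encoder reranker, assuming PyTorch and that the model emits one relevance score per (query, passage) pair. The function names, the temperature parameter, and the convention that index 0 is the ground-truth positive are all illustrative assumptions.

```python
# Sketch of contrastive vs. distillation objectives for a cross-encoder reranker.
# Assumes scores of shape (batch, n_candidates); all names are hypothetical.
import torch
import torch.nn.functional as F


def contrastive_loss(student_scores: torch.Tensor) -> torch.Tensor:
    """Contrastive learning: treat candidate 0 as the labeled positive and the
    rest as negatives; optimize cross-entropy directly on the ground-truth label."""
    labels = torch.zeros(student_scores.size(0), dtype=torch.long,
                         device=student_scores.device)
    return F.cross_entropy(student_scores, labels)


def distillation_loss(student_scores: torch.Tensor,
                      teacher_scores: torch.Tensor,
                      temperature: float = 1.0) -> torch.Tensor:
    """Knowledge distillation: match the student's score distribution over the
    candidate list to a frozen teacher's distribution via KL divergence."""
    student_logp = F.log_softmax(student_scores / temperature, dim=-1)
    teacher_p = F.softmax(teacher_scores / temperature, dim=-1)
    return F.kl_div(student_logp, teacher_p, reduction="batchmean")


if __name__ == "__main__":
    # Dummy scores: batch of 2 queries, 8 candidate passages each.
    student = torch.randn(2, 8)
    teacher = torch.randn(2, 8).detach()  # teacher is not updated
    print(contrastive_loss(student).item(), distillation_loss(student, teacher).item())
```

The contrast matches the paper's framing: the first loss needs only ground-truth labels, while the second needs a (preferably more performant) teacher whose scores define the target distribution.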