@inproceedings{yu-etal-2025-revisiting,
title = "Revisiting Intermediate-Layer Matching in Knowledge Distillation: Layer-Selection Strategy Doesn{'}t Matter (Much)",
author = "Yu, Zony and
Wen, Yuqiao and
Mou, Lili",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.105/",
pages = "1686--1694",
ISBN = "979-8-89176-303-6",
abstract = "Knowledge distillation (KD) is a popular method of transferring knowledge from a large ``teacher'' model to a small ``student'' model. Previous work has explored various layer-selection strategies (e.g., forward matching and in-order random matching) for intermediate-layer matching in KD, where a student layer is forced to resemble a certain teacher layer. In this work, we revisit such layer-selection strategies and observe an intriguing phenomenon that layer-selection strategy does not matter (much) in intermediate-layer matching{---}even seemingly nonsensical matching strategies such as \textit{reverse matching} still result in surprisingly good student performance. We provide an interpretation for this phenomenon by examining the angles between teacher layers viewed from the student{'}s perspective. Our work sheds light on KD practice, as layer-selection strategies may not be the main focus of KD system design and vanilla forward matching works well in most setups."
}

Markdown (Informal)
[Revisiting Intermediate-Layer Matching in Knowledge Distillation: Layer-Selection Strategy Doesn’t Matter (Much)](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.105/) (Yu et al., Findings 2025)
ACL
Zony Yu, Yuqiao Wen, and Lili Mou. 2025. [Revisiting Intermediate-Layer Matching in Knowledge Distillation: Layer-Selection Strategy Doesn’t Matter (Much)](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.105/). In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1686–1694, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.
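
As a rough illustration of the intermediate-layer matching described in the abstract, below is a minimal PyTorch sketch contrasting forward and reverse layer-selection strategies. The `layer_mapping` helper, the uniform mapping rule, and the MSE matching loss are illustrative assumptions for this sketch, not the authors' implementation.

```python
# Hypothetical sketch of intermediate-layer matching in knowledge distillation.
# Illustrates the layer-selection strategies named in the abstract
# (forward vs. reverse matching); not the paper's actual code.
import torch
import torch.nn.functional as F

def layer_mapping(num_student: int, num_teacher: int, strategy: str) -> list[int]:
    """Map each student layer index to a teacher layer index."""
    # "Forward matching": spread student layers uniformly over teacher layers,
    # preserving depth order (assumed mapping rule for this sketch).
    forward = [round((i + 1) * num_teacher / num_student) - 1
               for i in range(num_student)]
    if strategy == "forward":
        return forward
    if strategy == "reverse":
        # "Reverse matching": pair student layers with the same teacher
        # layers but in the opposite depth order.
        return forward[::-1]
    raise ValueError(f"unknown strategy: {strategy}")

def matching_loss(student_states, teacher_states, strategy="forward"):
    """Mean MSE between each student hidden state and its selected teacher state."""
    mapping = layer_mapping(len(student_states), len(teacher_states), strategy)
    return sum(
        F.mse_loss(s, teacher_states[t].detach())  # teacher is frozen
        for s, t in zip(student_states, mapping)
    ) / len(student_states)

# Toy example: 4-layer student, 12-layer teacher, batch 2, seq len 8, hidden 16.
# (Random tensors stand in for hidden states; a real setup would also need a
# projection if student and teacher hidden sizes differ.)
student = [torch.randn(2, 8, 16, requires_grad=True) for _ in range(4)]
teacher = [torch.randn(2, 8, 16) for _ in range(12)]
print(matching_loss(student, teacher, "forward"))
print(matching_loss(student, teacher, "reverse"))
```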