% Workshop paper; ACL Anthology ID 2025.insights-1.10 (url is the canonical permalink, not a preview build).
@inproceedings{suzuki-etal-2025-aligning,
    title = "Aligning Sizes of Intermediate Layers by {LoRA} Adapter for Knowledge Distillation",
    author = "Suzuki, Takeshi and
      Yamada, Hiroaki and
      Tokunaga, Takenobu",
    editor = "Drozd, Aleksandr and
      Sedoc, Jo{\~a}o and
      Tafreshi, Shabnam and
      Akula, Arjun and
      Shu, Raphael",
    booktitle = "The Sixth Workshop on Insights from Negative Results in {NLP}",
    month = may,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.insights-1.10/",
    doi = "10.18653/v1/2025.insights-1.10",
    pages = "100--105",
    isbn = "979-8-89176-240-4",
    abstract = "Intermediate Layer Distillation (ILD) is a variant of Knowledge Distillation (KD), a method for compressing neural networks. ILD requires mapping to align the intermediate layer sizes of the teacher and student models to compute the loss function in training, while this mapping is not used during inference. This inconsistency may reduce the effectiveness of learning in intermediate layers. In this study, we propose LoRAILD, which uses LoRA adapters to eliminate the inconsistency. However, our experimental results show that LoRAILD does not outperform existing methods. Furthermore, contrary to previous studies, we observe that conventional ILD does not outperform vanilla KD. Our analysis of the distilled models' intermediate layers suggests that ILD does not improve language models' performance."
}
Markdown (Informal)
[Aligning Sizes of Intermediate Layers by LoRA Adapter for Knowledge Distillation](https://aclanthology.org/2025.insights-1.10/) (Suzuki et al., insights 2025)
ACL