@inproceedings{xu-etal-2025-self-distillation,
title = "A Self-Distillation Recipe for Neural Machine Translation",
author = "Xu, Hongfei and
Liang, Zhuofei and
Liu, Qiuhui and
Mu, Lingling",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.261/",
pages = "5050--5064",
ISBN = "979-8-89176-256-5",
abstract = "Self-distillation distills the deeper sub-networks to the shallower sub-networks without using an extra teacher model, and has been proven effective in improving the performance of a series of computer vision tasks. In this paper, we study the representation-based self-distillation methods for Neural Machine Translation (NMT) considering the efficiency issue with a large vocabulary. We present a rank-order augmented Pearson correlation loss and an iterative distillation method to prevent the discrepancy of predictions between the student and a stronger teacher from disturbing the training. To prevent the teacher from misleading the student{'}s learning, we utilize a warm-up strategy and present a gradient adaption method to scale down or zero the Knowledge Distillation (KD) gradients which are opposite to the translation. Experiments show that our method can lead to significant improvements over the strong Transformer baseline on low/middle/high-resource tasks, obtaining comparable performance to previous MT KD studies without pre-training a teacher. Deeper Transformer experiments show that our method can lead to comparable or better performance with fewer layers."
}
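
The abstract names two mechanisms: a rank-order augmented Pearson correlation loss on representations and a gradient adaption step that scales down or zeroes KD gradients opposing the translation gradient. The sketch below is a hypothetical PyTorch illustration of what such components could look like, not the authors' implementation; the soft-rank surrogate, tensor shapes, the `alpha` and `scale` parameters, and all function names are assumptions made for the example.

```python
# Hypothetical sketch (not from the paper): a correlation-based representation
# distillation loss with a soft rank-order term, plus per-parameter gradient
# adaption for conflicting KD gradients. All names and shapes are assumptions.
import torch


def soft_rank(x: torch.Tensor, tau: float = 0.1) -> torch.Tensor:
    """Differentiable rank surrogate: rank_i ~= sum_j sigmoid((x_i - x_j) / tau).

    Quadratic in the feature dimension; fine for a sketch, not for large tensors.
    """
    return torch.sigmoid((x.unsqueeze(-1) - x.unsqueeze(-2)) / tau).sum(-1)


def pearson_loss(s: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    """1 - Pearson correlation between student and teacher feature vectors."""
    s = s - s.mean(dim=-1, keepdim=True)
    t = t - t.mean(dim=-1, keepdim=True)
    corr = (s * t).sum(-1) / (s.norm(dim=-1) * t.norm(dim=-1) + 1e-6)
    return (1.0 - corr).mean()


def rank_augmented_pearson_loss(student_h, teacher_h, alpha: float = 0.5):
    """Value-based Pearson loss plus a rank-order term computed on soft ranks."""
    s, t = student_h.flatten(1), teacher_h.flatten(1).detach()  # teacher side is not trained
    return pearson_loss(s, t) + alpha * pearson_loss(soft_rank(s), soft_rank(t))


def adapted_backward(model, loss_ce, loss_kd, scale: float = 0.0):
    """Gradient adaption: where a parameter's KD gradient points against its
    translation (cross-entropy) gradient, scale it down (scale=0 zeroes it)."""
    params = [p for p in model.parameters() if p.requires_grad]
    g_ce = torch.autograd.grad(loss_ce, params, retain_graph=True, allow_unused=True)
    g_kd = torch.autograd.grad(loss_kd, params, allow_unused=True)
    for p, gc, gk in zip(params, g_ce, g_kd):
        gc = torch.zeros_like(p) if gc is None else gc
        gk = torch.zeros_like(p) if gk is None else gk
        if torch.sum(gc * gk) < 0:   # KD gradient conflicts with translation gradient
            gk = gk * scale
        p.grad = gc + gk             # optimizer.step() then applies the combined update
```

In a training loop, one might compute `loss_ce` from the translation output, `loss_kd` from `rank_augmented_pearson_loss` between a shallower sub-network's hidden states and a deeper sub-network's hidden states, and call `adapted_backward` in place of a plain `backward()` before `optimizer.step()`; how the paper actually combines these components is described in the work itself.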