@inproceedings{zhu-etal-2021-gaml,
title = "{GAML}-{BERT}: Improving {BERT} Early Exiting by Gradient Aligned Mutual Learning",
author = "Zhu, Wei and
Wang, Xiaoling and
Ni, Yuan and
Xie, Guotong",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.emnlp-main.242/",
doi = "10.18653/v1/2021.emnlp-main.242",
pages = "3033--3044",
abstract = "In this work, we propose a novel framework, Gradient Aligned Mutual Learning BERT (GAML-BERT), for improving the early exiting of BERT. GAML-BERT{'}s contributions are two-fold. We conduct a set of pilot experiments, which shows that mutual knowledge distillation between a shallow exit and a deep exit leads to better performances for both. From this observation, we use mutual learning to improve BERT{'}s early exiting performances, that is, we ask each exit of a multi-exit BERT to distill knowledge from each other. Second, we propose GA, a novel training method that aligns the gradients from knowledge distillation to cross-entropy losses. Extensive experiments are conducted on the GLUE benchmark, which shows that our GAML-BERT can significantly outperform the state-of-the-art (SOTA) BERT early exiting methods."
}
Markdown (Informal)
[GAML-BERT: Improving BERT Early Exiting by Gradient Aligned Mutual Learning](https://aclanthology.org/2021.emnlp-main.242/) (Zhu et al., EMNLP 2021)
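
The abstract names two ideas: mutual knowledge distillation between a shallow and a deep exit, and aligning the distillation gradient with the cross-entropy gradient. The Python sketch below is only an illustration of those ideas, not the paper's method: the toy `TwoExitEncoder`, the hyperparameters, and the per-parameter dot-product rule in `aligned_grads` are assumptions made for this example; the paper's actual GA formulation and multi-exit BERT architecture differ.

```python
# Illustrative sketch (assumed details, not the paper's implementation):
# mutual distillation between two exits plus a simple gradient-alignment rule.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoExitEncoder(nn.Module):
    """Toy stand-in for a multi-exit BERT: two blocks, each with its own classifier."""
    def __init__(self, dim=32, num_classes=3):
        super().__init__()
        self.block1 = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())
        self.block2 = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())
        self.exit1 = nn.Linear(dim, num_classes)   # shallow exit
        self.exit2 = nn.Linear(dim, num_classes)   # deep exit

    def forward(self, x):
        h1 = self.block1(x)
        h2 = self.block2(h1)
        return self.exit1(h1), self.exit2(h2)

def mutual_kd_loss(logits_a, logits_b, temperature=2.0):
    """Symmetric KL between the two exits' softened predictions (mutual learning)."""
    log_p_a = F.log_softmax(logits_a / temperature, dim=-1)
    log_p_b = F.log_softmax(logits_b / temperature, dim=-1)
    kl_ab = F.kl_div(log_p_a, log_p_b.exp(), reduction="batchmean")
    kl_ba = F.kl_div(log_p_b, log_p_a.exp(), reduction="batchmean")
    return (kl_ab + kl_ba) * (temperature ** 2)

def aligned_grads(params, ce_loss, kd_loss):
    """Assumed alignment rule: keep the KD gradient for a parameter only when it
    does not oppose that parameter's cross-entropy gradient (dot-product check)."""
    ce_grads = torch.autograd.grad(ce_loss, params, retain_graph=True)
    kd_grads = torch.autograd.grad(kd_loss, params, retain_graph=True)
    merged = []
    for g_ce, g_kd in zip(ce_grads, kd_grads):
        if torch.sum(g_ce * g_kd) < 0:     # conflicting directions: drop the KD part
            merged.append(g_ce)
        else:
            merged.append(g_ce + g_kd)
    return merged

model = TwoExitEncoder()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
x = torch.randn(8, 32)                      # dummy batch of features
y = torch.randint(0, 3, (8,))               # dummy labels

logits1, logits2 = model(x)
ce = F.cross_entropy(logits1, y) + F.cross_entropy(logits2, y)
kd = mutual_kd_loss(logits1, logits2)

params = [p for p in model.parameters() if p.requires_grad]
for p, g in zip(params, aligned_grads(params, ce, kd)):
    p.grad = g                              # write merged gradients manually
opt.step()
```

In this sketch the cross-entropy gradient is always applied and the mutual-distillation gradient is added only when it agrees with it; how GAML-BERT actually defines and enforces alignment is described in the paper itself.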