@inproceedings{zhou-etal-2020-scheduled,
title = "Scheduled {D}rop{H}ead: A Regularization Method for Transformer Models",
author = "Zhou, Wangchunshu and
Ge, Tao and
Wei, Furu and
Zhou, Ming and
Xu, Ke",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2020.findings-emnlp.178/",
doi = "10.18653/v1/2020.findings-emnlp.178",
pages = "1971--1980",
abstract = "We introduce DropHead, a structured dropout method specifically designed for regularizing the multi-head attention mechanism which is a key component of transformer. In contrast to the conventional dropout mechanism which randomly drops units or connections, DropHead drops entire attention heads during training to prevent the multi-head attention model from being dominated by a small portion of attention heads. It can help reduce the risk of overfitting and allow the models to better benefit from the multi-head attention. Given the interaction between multi-headedness and training dynamics, we further propose a novel dropout rate scheduler to adjust the dropout rate of DropHead throughout training, which results in a better regularization effect. Experimental results demonstrate that our proposed approach can improve transformer models by 0.9 BLEU score on WMT14 En-De translation task and around 1.0 accuracy for various text classification tasks."
}
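The abstract describes dropping entire attention heads (rather than individual units) during training. A minimal PyTorch sketch of that idea is given below; the module and function names, the inverted-dropout rescaling, and the linear rate ramp are illustrative assumptions for this sketch, not the authors' released implementation or the exact scheduler proposed in the paper.

```python
import torch


def drop_head_mask(batch_size, num_heads, p, device=None):
    """Sample a per-head keep mask: each whole head is zeroed with probability p."""
    keep = (torch.rand(batch_size, num_heads, 1, 1, device=device) >= p).float()
    # Rescale surviving heads so the expected output magnitude is unchanged.
    return keep / (1.0 - p + 1e-8)


def scheduled_drop_head_rate(step, total_steps, max_rate=0.2):
    """Simple linear ramp for illustration only; the paper's schedule differs."""
    return max_rate * min(step / max(total_steps, 1), 1.0)


class MultiHeadAttentionWithDropHead(torch.nn.Module):
    """Standard multi-head self-attention with head-level (structured) dropout."""

    def __init__(self, d_model, num_heads, drop_head_rate=0.2):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_head = d_model // num_heads
        self.qkv = torch.nn.Linear(d_model, 3 * d_model)
        self.out = torch.nn.Linear(d_model, d_model)
        self.drop_head_rate = drop_head_rate

    def forward(self, x):
        b, t, d = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # Reshape to (batch, heads, time, d_head).
        q, k, v = (z.view(b, t, self.num_heads, self.d_head).transpose(1, 2)
                   for z in (q, k, v))
        attn = torch.softmax(q @ k.transpose(-2, -1) / self.d_head ** 0.5, dim=-1)
        heads = attn @ v  # (batch, heads, time, d_head)
        if self.training and self.drop_head_rate > 0:
            # DropHead: mask out whole heads, not individual activations.
            heads = heads * drop_head_mask(b, self.num_heads,
                                           self.drop_head_rate, x.device)
        return self.out(heads.transpose(1, 2).reshape(b, t, d))


# Usage sketch: update the drop rate from the schedule each training step.
mha = MultiHeadAttentionWithDropHead(d_model=512, num_heads=8)
mha.train()
mha.drop_head_rate = scheduled_drop_head_rate(step=1000, total_steps=100000)
y = mha(torch.randn(2, 16, 512))
```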