@inproceedings{you-etal-2020-hard,
title = "Hard-Coded {G}aussian Attention for Neural Machine Translation",
author = "You, Weiqiu and
Sun, Simeng and
Iyyer, Mohit",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.acl-main.687/",
doi = "10.18653/v1/2020.acl-main.687",
pages = "7689--7700",
    abstract = "Recent work has questioned the importance of the Transformer's multi-headed attention for achieving high translation quality. We push further in this direction by developing a {\textquotedblleft}hard-coded{\textquotedblright} attention variant without any learned parameters. Surprisingly, replacing all learned self-attention heads in the encoder and decoder with fixed, input-agnostic Gaussian distributions minimally impacts BLEU scores across four different language pairs. However, additionally hard-coding cross attention (which connects the decoder to the encoder) significantly lowers BLEU, suggesting that it is more important than self-attention. Much of this BLEU drop can be recovered by adding just a single learned cross attention head to an otherwise hard-coded Transformer. Taken as a whole, our results offer insight into which components of the Transformer are actually important, which we hope will guide future work into the development of simpler and more efficient attention-based models."
}
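To make the abstract's idea concrete, here is a minimal sketch (not the authors' code) of what an input-agnostic, hard-coded Gaussian attention head could look like: for each query position, the weights over key positions follow a fixed Gaussian centered near that position, independent of the token content. The `offset` and `sigma` values below are illustrative assumptions, not the paper's exact configuration.

```python
import numpy as np

def hard_coded_gaussian_attention(seq_len: int, offset: int = 0, sigma: float = 1.0) -> np.ndarray:
    """Return a (seq_len, seq_len) attention matrix that depends only on positions.

    Row i is the attention distribution for query position i: a Gaussian over key
    positions j, centered at i + offset, with fixed standard deviation sigma.
    No learned parameters and no dependence on the input tokens.
    """
    positions = np.arange(seq_len)
    # Unnormalized log-weights: -(j - (i + offset))^2 / (2 * sigma^2)
    logits = -((positions[None, :] - (positions[:, None] + offset)) ** 2) / (2.0 * sigma ** 2)
    weights = np.exp(logits)
    return weights / weights.sum(axis=-1, keepdims=True)  # normalize each row to sum to 1

# Example: a 5-token sequence; each row is one query position's distribution over keys.
print(np.round(hard_coded_gaussian_attention(5, offset=1), 3))
```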