@inproceedings{zhang-etal-2023-closer,
title = "A Closer Look at Transformer Attention for Multilingual Translation",
author = "Zhang, Jingyi and
de Melo, Gerard and
Xu, Hongfei and
Chen, Kehai",
editor = "Koehn, Philipp and
Haddow, Barry and
Kocmi, Tom and
Monz, Christof",
booktitle = "Proceedings of the Eighth Conference on Machine Translation",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Add-Cong-Liu-Florida-Atlantic-University-author-id/2023.wmt-1.45/",
doi = "10.18653/v1/2023.wmt-1.45",
pages = "496--506",
abstract = "Transformers are the predominant model for machine translation. Recent works also showed that a single Transformer model can be trained to learn translation for multiple different language pairs, achieving promising results. In this work, we investigate how the multilingual Transformer model pays attention for translating different language pairs. We first performed automatic pruning to eliminate a large number of noisy heads and then analyzed the functions and behaviors of the remaining heads in both self-attention and cross-attention. We find that different language pairs, in spite of having different syntax and word orders, tended to share the same heads for the same functions, such as syntax heads and reordering heads. However, the different characteristics of different language pairs clearly caused interference in function heads and affected head accuracies. Additionally, we reveal an interesting behavior of the Transformer cross-attention: the deep-layer cross-attention heads work in a clear cooperative way to learn different options for word reordering, which can be caused by the nature of translation tasks having multiple different gold translations in the target language for the same source sentence."
}