@inproceedings{zhang-etal-2020-long,
    title = "Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation",
    author = "Zhang, Pei  and
      Chen, Boxing  and
      Ge, Niyu  and
      Fan, Kai",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.81",
    doi = "10.18653/v1/2020.emnlp-main.81",
    pages = "1081--1087",
    abstract = "Many document-level neural machine translation (NMT) systems have explored the utility of context-aware architectures, usually at the cost of a growing number of parameters and higher computational complexity. However, little attention has been paid to the baseline model. In this paper, we extensively investigate the pros and cons of the standard transformer in document-level translation, and find that its auto-regressive property can simultaneously bring the advantage of consistency and the disadvantage of error accumulation. We therefore propose a surprisingly simple long-short term masking self-attention on top of the standard transformer, which both captures long-range dependencies effectively and reduces error propagation. We evaluate our approach on two publicly available document-level datasets, achieving strong BLEU results and capturing discourse phenomena.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2020-long">
    <titleInfo>
        <title>Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Pei</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Boxing</namePart>
        <namePart type="family">Chen</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Niyu</namePart>
        <namePart type="family">Ge</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">Fan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
        </titleInfo>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Online</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Many document-level neural machine translation (NMT) systems have explored the utility of context-aware architectures, usually at the cost of a growing number of parameters and higher computational complexity. However, little attention has been paid to the baseline model. In this paper, we extensively investigate the pros and cons of the standard transformer in document-level translation, and find that its auto-regressive property can simultaneously bring the advantage of consistency and the disadvantage of error accumulation. We therefore propose a surprisingly simple long-short term masking self-attention on top of the standard transformer, which both captures long-range dependencies effectively and reduces error propagation. We evaluate our approach on two publicly available document-level datasets, achieving strong BLEU results and capturing discourse phenomena.</abstract>
    <identifier type="citekey">zhang-etal-2020-long</identifier>
    <identifier type="doi">10.18653/v1/2020.emnlp-main.81</identifier>
    <location>
        <url>https://aclanthology.org/2020.emnlp-main.81</url>
    </location>
    <part>
        <date>2020-11</date>
        <extent unit="page">
            <start>1081</start>
            <end>1087</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation
%A Zhang, Pei
%A Chen, Boxing
%A Ge, Niyu
%A Fan, Kai
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F zhang-etal-2020-long
%X Many document-level neural machine translation (NMT) systems have explored the utility of context-aware architectures, usually at the cost of a growing number of parameters and higher computational complexity. However, little attention has been paid to the baseline model. In this paper, we extensively investigate the pros and cons of the standard transformer in document-level translation, and find that its auto-regressive property can simultaneously bring the advantage of consistency and the disadvantage of error accumulation. We therefore propose a surprisingly simple long-short term masking self-attention on top of the standard transformer, which both captures long-range dependencies effectively and reduces error propagation. We evaluate our approach on two publicly available document-level datasets, achieving strong BLEU results and capturing discourse phenomena.
%R 10.18653/v1/2020.emnlp-main.81
%U https://aclanthology.org/2020.emnlp-main.81
%U https://doi.org/10.18653/v1/2020.emnlp-main.81
%P 1081-1087
Markdown (Informal)
[Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation](https://aclanthology.org/2020.emnlp-main.81) (Zhang et al., EMNLP 2020)
ACL
Pei Zhang, Boxing Chen, Niyu Ge, and Kai Fan. 2020. Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1081–1087, Online. Association for Computational Linguistics.
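
The abstract above sketches the paper's core idea: keeping the standard transformer intact and changing only the self-attention mask, so that part of the attention is confined to the current sentence (short-term) while the rest ranges over the surrounding document (long-term). As a rough illustration only, here is a minimal NumPy sketch of one plausible reading of that idea; the sentence-id input, the head split, and the causal option are assumptions for illustration, not the paper's exact configuration.

```python
# Minimal sketch of long-short term attention masking (NumPy).
# Assumption: "short-term" heads attend only within the current sentence,
# while "long-term" heads attend across the whole document window. The
# paper's exact masking scheme may differ; see the publication for details.
import numpy as np

def long_short_masks(sent_ids: np.ndarray, causal: bool = True):
    """Build boolean attention masks (True = position may be attended).

    sent_ids: shape (T,), the sentence index of each token in the document.
    Returns (short_mask, long_mask), each of shape (T, T).
    """
    T = sent_ids.shape[0]
    short = sent_ids[:, None] == sent_ids[None, :]   # same-sentence pairs
    long_ = np.ones((T, T), dtype=bool)              # full document window
    if causal:  # decoder side: never attend to future tokens
        tri = np.tril(np.ones((T, T), dtype=bool))
        short, long_ = short & tri, long_ & tri
    return short, long_

def masked_attention(q, k, v, mask):
    """Scaled dot-product attention restricted by a boolean mask."""
    scores = q @ k.T / np.sqrt(q.shape[-1])
    scores = np.where(mask, scores, -1e9)            # block disallowed pairs
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v

# Example: a 6-token "document" made of two 3-token sentences.
sent_ids = np.array([0, 0, 0, 1, 1, 1])
short, long_ = long_short_masks(sent_ids)
q = k = v = np.random.randn(6, 8)
out_short = masked_attention(q, k, v, short)  # intra-sentence heads
out_long = masked_attention(q, k, v, long_)   # document-wide heads
```

Per the abstract, the appeal of this split is that document-wide attention preserves consistency while sentence-local attention limits how far decoding errors can propagate, and since only the mask changes, no parameters are added to the standard transformer.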