@inproceedings{freitag-etal-2019-ape,
title = "{APE} at Scale and Its Implications on {MT} Evaluation Biases",
author = "Freitag, Markus and
Caswell, Isaac and
Roy, Scott",
booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-5204",
doi = "10.18653/v1/W19-5204",
pages = "34--44",
abstract = "In this work, we train an Automatic Post-Editing (APE) model and use it to reveal biases in standard MT evaluation procedures. The goal of our APE model is to correct typical errors introduced by the translation process, and convert the {``}translationese{''} output into natural text. Our APE model is trained entirely on monolingual data that has been round-trip translated through English, to mimic errors that are similar to the ones introduced by NMT. We apply our model to the output of existing NMT systems, and demonstrate that, while the human-judged quality improves in all cases, BLEU scores drop with forward-translated test sets. We verify these results for the WMT18 English to German, WMT15 English to French, and WMT16 English to Romanian tasks. Furthermore, we selectively apply our APE model on the output of the top submissions of the most recent WMT evaluation campaigns. We see quality improvements on all tasks of up to 2.5 BLEU points.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="freitag-etal-2019-ape">
<titleInfo>
<title>APE at Scale and Its Implications on MT Evaluation Biases</title>
</titleInfo>
<name type="personal">
<namePart type="given">Markus</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isaac</namePart>
<namePart type="family">Caswell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Scott</namePart>
<namePart type="family">Roy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-aug</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we train an Automatic Post-Editing (APE) model and use it to reveal biases in standard MT evaluation procedures. The goal of our APE model is to correct typical errors introduced by the translation process, and convert the “translationese” output into natural text. Our APE model is trained entirely on monolingual data that has been round-trip translated through English, to mimic errors that are similar to the ones introduced by NMT. We apply our model to the output of existing NMT systems, and demonstrate that, while the human-judged quality improves in all cases, BLEU scores drop with forward-translated test sets. We verify these results for the WMT18 English to German, WMT15 English to French, and WMT16 English to Romanian tasks. Furthermore, we selectively apply our APE model on the output of the top submissions of the most recent WMT evaluation campaigns. We see quality improvements on all tasks of up to 2.5 BLEU points.</abstract>
<identifier type="citekey">freitag-etal-2019-ape</identifier>
<identifier type="doi">10.18653/v1/W19-5204</identifier>
<location>
<url>https://aclanthology.org/W19-5204</url>
</location>
<part>
<date>2019-aug</date>
<extent unit="page">
<start>34</start>
<end>44</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T APE at Scale and Its Implications on MT Evaluation Biases
%A Freitag, Markus
%A Caswell, Isaac
%A Roy, Scott
%S Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)
%D 2019
%8 aug
%I Association for Computational Linguistics
%C Florence, Italy
%F freitag-etal-2019-ape
%X In this work, we train an Automatic Post-Editing (APE) model and use it to reveal biases in standard MT evaluation procedures. The goal of our APE model is to correct typical errors introduced by the translation process, and convert the “translationese” output into natural text. Our APE model is trained entirely on monolingual data that has been round-trip translated through English, to mimic errors that are similar to the ones introduced by NMT. We apply our model to the output of existing NMT systems, and demonstrate that, while the human-judged quality improves in all cases, BLEU scores drop with forward-translated test sets. We verify these results for the WMT18 English to German, WMT15 English to French, and WMT16 English to Romanian tasks. Furthermore, we selectively apply our APE model on the output of the top submissions of the most recent WMT evaluation campaigns. We see quality improvements on all tasks of up to 2.5 BLEU points.
%R 10.18653/v1/W19-5204
%U https://aclanthology.org/W19-5204
%U https://doi.org/10.18653/v1/W19-5204
%P 34-44
Markdown (Informal)
[APE at Scale and Its Implications on MT Evaluation Biases](https://aclanthology.org/W19-5204) (Freitag et al., 2019)
ACL