@inproceedings{edman-etal-2020-data,
title = "Data Selection for Unsupervised Translation of {G}erman{--}{U}pper {S}orbian",
author = "Edman, Lukas and
Toral, Antonio and
van Noord, Gertjan",
booktitle = "Proceedings of the Fifth Conference on Machine Translation",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wmt-1.130",
pages = "1099--1103",
abstract = "This paper describes the methods behind the systems submitted by the University of Groningen for the WMT 2020 Unsupervised Machine Translation task for German{--}Upper Sorbian. We investigate the usefulness of data selection in the unsupervised setting. We find that we can perform data selection using a pretrained model and show that the quality of a set of sentences or documents can have a great impact on the performance of the UNMT system trained on it. Furthermore, we show that document-level data selection should be preferred for training the XLM model when possible. Finally, we show that there is a trade-off between quality and quantity of the data used to train UNMT systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="edman-etal-2020-data">
<titleInfo>
<title>Data Selection for Unsupervised Translation of German–Upper Sorbian</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lukas</namePart>
<namePart type="family">Edman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gertjan</namePart>
<namePart type="family">van Noord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Conference on Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the methods behind the systems submitted by the University of Groningen for the WMT 2020 Unsupervised Machine Translation task for German–Upper Sorbian. We investigate the usefulness of data selection in the unsupervised setting. We find that we can perform data selection using a pretrained model and show that the quality of a set of sentences or documents can have a great impact on the performance of the UNMT system trained on it. Furthermore, we show that document-level data selection should be preferred for training the XLM model when possible. Finally, we show that there is a trade-off between quality and quantity of the data used to train UNMT systems.</abstract>
<identifier type="citekey">edman-etal-2020-data</identifier>
<location>
<url>https://aclanthology.org/2020.wmt-1.130</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>1099</start>
<end>1103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Selection for Unsupervised Translation of German–Upper Sorbian
%A Edman, Lukas
%A Toral, Antonio
%A van Noord, Gertjan
%S Proceedings of the Fifth Conference on Machine Translation
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F edman-etal-2020-data
%X This paper describes the methods behind the systems submitted by the University of Groningen for the WMT 2020 Unsupervised Machine Translation task for German–Upper Sorbian. We investigate the usefulness of data selection in the unsupervised setting. We find that we can perform data selection using a pretrained model and show that the quality of a set of sentences or documents can have a great impact on the performance of the UNMT system trained on it. Furthermore, we show that document-level data selection should be preferred for training the XLM model when possible. Finally, we show that there is a trade-off between quality and quantity of the data used to train UNMT systems.
%U https://aclanthology.org/2020.wmt-1.130
%P 1099-1103
Markdown (Informal)
[Data Selection for Unsupervised Translation of German–Upper Sorbian](https://aclanthology.org/2020.wmt-1.130) (Edman et al., WMT 2020)
ACL