@inproceedings{koerner-koehn-2020-dual,
title = "Dual Conditional Cross Entropy Scores and {LASER} Similarity Scores for the {WMT}20 Parallel Corpus Filtering Shared Task",
author = "Koerner, Felicia and
Koehn, Philipp",
booktitle = "Proceedings of the Fifth Conference on Machine Translation",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wmt-1.109",
pages = "966--971",
abstract = "This paper describes our submission to the WMT20 Parallel Corpus Filtering and Alignment for Low-Resource Conditions Shared Task. This year{'}s corpora are noisy Khmer-English and Pashto-English, with 58.3 million and 11.6 million words respectively (English token count). Our submission focuses on filtering Pashto-English, building on previously successful methods to produce two sets of scores: LASER{\_}LM, a combination of the LASER similarity scores provided in the shared task and perplexity scores from language models, and DCCEF{\_}DUP, dual conditional cross entropy scores combined with a duplication penalty. We improve slightly on the LASER similarity score and find that the provided clean data can successfully be supplemented with a subsampled set of the noisy data, effectively increasing the training data for the models used for dual conditional cross entropy scoring.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="koerner-koehn-2020-dual">
<titleInfo>
<title>Dual Conditional Cross Entropy Scores and LASER Similarity Scores for the WMT20 Parallel Corpus Filtering Shared Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felicia</namePart>
<namePart type="family">Koerner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Conference on Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes our submission to the WMT20 Parallel Corpus Filtering and Alignment for Low-Resource Conditions Shared Task. This year’s corpora are noisy Khmer-English and Pashto-English, with 58.3 million and 11.6 million words respectively (English token count). Our submission focuses on filtering Pashto-English, building on previously successful methods to produce two sets of scores: LASER_LM, a combination of the LASER similarity scores provided in the shared task and perplexity scores from language models, and DCCEF_DUP, dual conditional cross entropy scores combined with a duplication penalty. We improve slightly on the LASER similarity score and find that the provided clean data can successfully be supplemented with a subsampled set of the noisy data, effectively increasing the training data for the models used for dual conditional cross entropy scoring.</abstract>
<identifier type="citekey">koerner-koehn-2020-dual</identifier>
<location>
<url>https://aclanthology.org/2020.wmt-1.109</url>
</location>
<part>
<date>2020-nov</date>
<extent unit="page">
<start>966</start>
<end>971</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dual Conditional Cross Entropy Scores and LASER Similarity Scores for the WMT20 Parallel Corpus Filtering Shared Task
%A Koerner, Felicia
%A Koehn, Philipp
%S Proceedings of the Fifth Conference on Machine Translation
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F koerner-koehn-2020-dual
%X This paper describes our submission to the WMT20 Parallel Corpus Filtering and Alignment for Low-Resource Conditions Shared Task. This year’s corpora are noisy Khmer-English and Pashto-English, with 58.3 million and 11.6 million words respectively (English token count). Our submission focuses on filtering Pashto-English, building on previously successful methods to produce two sets of scores: LASER_LM, a combination of the LASER similarity scores provided in the shared task and perplexity scores from language models, and DCCEF_DUP, dual conditional cross entropy scores combined with a duplication penalty. We improve slightly on the LASER similarity score and find that the provided clean data can successfully be supplemented with a subsampled set of the noisy data, effectively increasing the training data for the models used for dual conditional cross entropy scoring.
%U https://aclanthology.org/2020.wmt-1.109
%P 966-971
Markdown (Informal)
[Dual Conditional Cross Entropy Scores and LASER Similarity Scores for the WMT20 Parallel Corpus Filtering Shared Task](https://aclanthology.org/2020.wmt-1.109) (Koerner & Koehn, WMT 2020)
ACL