@inproceedings{espla-gomis-etal-2020-bicleaner,
title = "Bicleaner at {WMT} 2020: {U}niversitat d{'}Alacant-Prompsit{'}s submission to the parallel corpus filtering shared task",
author = "Espl{\`a}-Gomis, Miquel and
S{\'a}nchez-Cartagena, V{\'\i}ctor M. and
Zaragoza-Bernabeu, Jaume and
S{\'a}nchez-Mart{\'\i}nez, Felipe",
booktitle = "Proceedings of the Fifth Conference on Machine Translation",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wmt-1.107",
pages = "952--958",
abstract = "This paper describes the joint submission of Universitat d{'}Alacant and Prompsit Language Engineering to the WMT 2020 shared task on parallel corpus filtering. Our submission, based on the free/open-source tool Bicleaner, enhances it with Extremely Randomised Trees and lexical similarity features that account for the frequency of the words in the parallel sentences to determine if two sentences are parallel. To train this classifier we used the clean corpora provided for the task and synthetic noisy parallel sentences. In addition we re-score the output of Bicleaner using character-level language models and n-gram saturation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="espla-gomis-etal-2020-bicleaner">
<titleInfo>
<title>Bicleaner at WMT 2020: Universitat d’Alacant-Prompsit’s submission to the parallel corpus filtering shared task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Víctor</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Sánchez-Cartagena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaume</namePart>
<namePart type="family">Zaragoza-Bernabeu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felipe</namePart>
<namePart type="family">Sánchez-Martínez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Conference on Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the joint submission of Universitat d’Alacant and Prompsit Language Engineering to the WMT 2020 shared task on parallel corpus filtering. Our submission, based on the free/open-source tool Bicleaner, enhances it with Extremely Randomised Trees and lexical similarity features that account for the frequency of the words in the parallel sentences to determine if two sentences are parallel. To train this classifier we used the clean corpora provided for the task and synthetic noisy parallel sentences. In addition we re-score the output of Bicleaner using character-level language models and n-gram saturation.</abstract>
<identifier type="citekey">espla-gomis-etal-2020-bicleaner</identifier>
<location>
<url>https://aclanthology.org/2020.wmt-1.107</url>
</location>
<part>
<date>2020-nov</date>
<extent unit="page">
<start>952</start>
<end>958</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bicleaner at WMT 2020: Universitat d’Alacant-Prompsit’s submission to the parallel corpus filtering shared task
%A Esplà-Gomis, Miquel
%A Sánchez-Cartagena, Víctor M.
%A Zaragoza-Bernabeu, Jaume
%A Sánchez-Martínez, Felipe
%S Proceedings of the Fifth Conference on Machine Translation
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F espla-gomis-etal-2020-bicleaner
%X This paper describes the joint submission of Universitat d’Alacant and Prompsit Language Engineering to the WMT 2020 shared task on parallel corpus filtering. Our submission, based on the free/open-source tool Bicleaner, enhances it with Extremely Randomised Trees and lexical similarity features that account for the frequency of the words in the parallel sentences to determine if two sentences are parallel. To train this classifier we used the clean corpora provided for the task and synthetic noisy parallel sentences. In addition we re-score the output of Bicleaner using character-level language models and n-gram saturation.
%U https://aclanthology.org/2020.wmt-1.107
%P 952-958
Markdown (Informal)
[Bicleaner at WMT 2020: Universitat d’Alacant-Prompsit’s submission to the parallel corpus filtering shared task](https://aclanthology.org/2020.wmt-1.107) (Esplà-Gomis et al., WMT 2020)
ACL