% ACL Anthology record W18-6489 (BibTeX form).
% These lines sit outside the entry, where BibTeX ignores all text, so they act as comments.
% The address field is written city-first ("City, Country").
@inproceedings{wang-etal-2018-nicts,
  title     = {{NICT}{'}s Corpus Filtering Systems for the {WMT}18 Parallel Corpus Filtering Task},
  author    = {Wang, Rui and
               Marie, Benjamin and
               Utiyama, Masao and
               Sumita, Eiichiro},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  month     = oct,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W18-6489},
  doi       = {10.18653/v1/W18-6489},
  pages     = {963--967},
  abstract  = {This paper presents the NICT{'}s participation in the WMT18 shared parallel corpus filtering task. The organizers provided 1 billion words German-English corpus crawled from the web as part of the Paracrawl project. This corpus is too noisy to build an acceptable neural machine translation (NMT) system. Using the clean data of the WMT18 shared news translation task, we designed several features and trained a classifier to score each sentence pairs in the noisy data. Finally, we sampled 100 million and 10 million words and built corresponding NMT systems. Empirical results show that our NMT systems trained on sampled data achieve promising performance.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey wang-etal-2018-nicts.
       The top-level elements describe the paper itself; the relatedItem of
       type "host" describes the proceedings volume that contains it
       (title, publisher, place). The placeTerm is written city-first. -->
  <mods ID="wang-etal-2018-nicts">
    <titleInfo>
      <title>NICT’s Corpus Filtering Systems for the WMT18 Parallel Corpus Filtering Task</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rui</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Benjamin</namePart>
      <namePart type="family">Marie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Masao</namePart>
      <namePart type="family">Utiyama</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eiichiro</namePart>
      <namePart type="family">Sumita</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-oct</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Third Conference on Machine Translation: Shared Task Papers</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Brussels, Belgium</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper presents the NICT’s participation in the WMT18 shared parallel corpus filtering task. The organizers provided 1 billion words German-English corpus crawled from the web as part of the Paracrawl project. This corpus is too noisy to build an acceptable neural machine translation (NMT) system. Using the clean data of the WMT18 shared news translation task, we designed several features and trained a classifier to score each sentence pairs in the noisy data. Finally, we sampled 100 million and 10 million words and built corresponding NMT systems. Empirical results show that our NMT systems trained on sampled data achieve promising performance.</abstract>
    <identifier type="citekey">wang-etal-2018-nicts</identifier>
    <identifier type="doi">10.18653/v1/W18-6489</identifier>
    <location>
      <url>https://aclanthology.org/W18-6489</url>
    </location>
    <part>
      <date>2018-oct</date>
      <extent unit="page">
        <start>963</start>
        <end>967</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T NICT’s Corpus Filtering Systems for the WMT18 Parallel Corpus Filtering Task
%A Wang, Rui
%A Marie, Benjamin
%A Utiyama, Masao
%A Sumita, Eiichiro
%S Proceedings of the Third Conference on Machine Translation: Shared Task Papers
%D 2018
%8 oct
%I Association for Computational Linguistics
%C Brussels, Belgium
%F wang-etal-2018-nicts
%X This paper presents the NICT’s participation in the WMT18 shared parallel corpus filtering task. The organizers provided 1 billion words German-English corpus crawled from the web as part of the Paracrawl project. This corpus is too noisy to build an acceptable neural machine translation (NMT) system. Using the clean data of the WMT18 shared news translation task, we designed several features and trained a classifier to score each sentence pairs in the noisy data. Finally, we sampled 100 million and 10 million words and built corresponding NMT systems. Empirical results show that our NMT systems trained on sampled data achieve promising performance.
%R 10.18653/v1/W18-6489
%U https://aclanthology.org/W18-6489
%U https://doi.org/10.18653/v1/W18-6489
%P 963-967
Markdown (Informal)
[NICT’s Corpus Filtering Systems for the WMT18 Parallel Corpus Filtering Task](https://aclanthology.org/W18-6489) (Wang et al., 2018)
ACL