% University of Helsinki system description for the WMT19 parallel corpus
% filtering shared task (ACL Anthology ID W19-5441).
@inproceedings{vazquez-etal-2019-university,
  title     = {The {University} of {Helsinki} Submission to the {WMT19} Parallel Corpus Filtering Task},
  author    = {V{\'a}zquez, Ra{\'u}l and
               Sulubacak, Umut and
               Tiedemann, J{\"o}rg},
  editor    = {Bojar, Ond{\v{r}}ej and
               Chatterjee, Rajen and
               Federmann, Christian and
               Fishel, Mark and
               Graham, Yvette and
               Haddow, Barry and
               Huck, Matthias and
               Jimeno Yepes, Antonio and
               Koehn, Philipp and
               Martins, Andr{\'e} and
               Monz, Christof and
               Negri, Matteo and
               N{\'e}v{\'e}ol, Aur{\'e}lie and
               Neves, Mariana and
               Post, Matt and
               Turchi, Marco and
               Verspoor, Karin},
  booktitle = {Proceedings of the Fourth Conference on Machine Translation (Volume 3: Shared Task Papers, Day 2)},
  month     = aug,
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W19-5441/},
  doi       = {10.18653/v1/W19-5441},
  pages     = {294--300},
  abstract  = {This paper describes the University of Helsinki Language Technology group's participation in the WMT 2019 parallel corpus filtering task. Our scores were produced using a two-step strategy. First, we individually applied a series of filters to remove the `bad' quality sentences. Then, we produced scores for each sentence by weighting these features with a classification model. This methodology allowed us to build a simple and reliable system that is easily adaptable to other language pairs.},
}
Markdown (Informal)
[The University of Helsinki Submission to the WMT19 Parallel Corpus Filtering Task](https://aclanthology.org/W19-5441/) (Vázquez et al., WMT 2019)
ACL