@comment{ACL Anthology record for a NoDaLiDa 2023 conference paper on filtering parallel training data for machine translation. The url field uses the canonical aclanthology.org address rather than a temporary preview-branch build. NOTE(review): address appears to hold the conference venue rather than the publisher's city -- confirm against the target bibliography style.}
@inproceedings{steingrimsson-etal-2023-filtering,
  title     = {Filtering Matters: Experiments in Filtering Training Sets for Machine Translation},
  author    = {Steingr{\'i}msson, Stein{\th}{\'o}r and
               Loftsson, Hrafn and
               Way, Andy},
  editor    = {Alum{\"a}e, Tanel and
               Fishel, Mark},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
  month     = may,
  year      = 2023,
  address   = {T{\'o}rshavn, Faroe Islands},
  publisher = {University of Tartu Library},
  url       = {https://aclanthology.org/2023.nodalida-1.58/},
  pages     = {588--600},
  abstract  = {We explore different approaches for filtering parallel data for MT training, whether the same filtering approaches suit different datasets, and if separate filters should be applied to a dataset depending on the translation direction. We evaluate the results of different approaches, both manually and on a downstream NMT task. We find that, first, it is beneficial to inspect how well different filtering approaches suit different datasets and, second, that while MT systems trained on data prepared using different filters do not differ substantially in quality, there is indeed a statistically significant difference. Finally, we find that the same training sets do not seem to suit different translation directions.},
}
Markdown (Informal)
[Filtering Matters: Experiments in Filtering Training Sets for Machine Translation](https://aclanthology.org/2023.nodalida-1.58/) (Steingrímsson et al., NoDaLiDa 2023)
ACL