@comment{
  Conference paper in the proceedings of the Ninth Conference on Machine
  Translation, 2024.
  Notes for maintainers:
  - Proper nouns in the title are braced as whole words so that styles which
    apply sentence casing keep their capitals.
  - The url field holds the canonical ACL Anthology address derived from the
    paper ID that also appears in the doi field.
}
@inproceedings{koehn-2024-neural,
  title     = {Neural Methods for Aligning Large-Scale Parallel Corpora from the Web for {South} and {East} {Asian} Languages},
  author    = {Koehn, Philipp},
  editor    = {Haddow, Barry and
               Kocmi, Tom and
               Koehn, Philipp and
               Monz, Christof},
  booktitle = {Proceedings of the Ninth Conference on Machine Translation},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.wmt-1.132/},
  doi       = {10.18653/v1/2024.wmt-1.132},
  pages     = {1454--1466},
  abstract  = {We introduce neural methods and a toxicity filtering step to the hierarchical web mining approach of Paracrawl (Ba{\~n}{\'o}n et al., 2020), showing large improvements. We apply these methods to web-scale parallel corpus mining for 9 South and East Asian national languages, creating training resources for machine translation that yield better translation quality for most of these languages than existing publicly available datasets in OPUS. Our methods also generally lead to better results than the global mining approach of Schwenk et al. (2021).},
}
Markdown (Informal)
[Neural Methods for Aligning Large-Scale Parallel Corpora from the Web for South and East Asian Languages](https://aclanthology.org/2024.wmt-1.132/) (Koehn, WMT 2024)
ACL