% Korotkova & Fishel, EAMT 2024 (Volume 1), ACL Anthology record 2024.eamt-1.55.
@inproceedings{korotkova-fishel-2024-estonian,
  title     = {{Estonian}-Centric Machine Translation: Data, Models, and Challenges},
  author    = {Korotkova, Elizaveta and
               Fishel, Mark},
  editor    = {Scarton, Carolina and
               Prescott, Charlotte and
               Bayliss, Chris and
               Oakley, Chris and
               Wright, Joanna and
               Wrigley, Stuart and
               Song, Xingyi and
               Gow-Smith, Edward and
               Bawden, Rachel and
               S{\'a}nchez-Cartagena, V{\'i}ctor M. and
               Cadwell, Patrick and
               Lapshinova-Koltunski, Ekaterina and
               Cabarr{\~a}o, Vera and
               Chatzitheodorou, Konstantinos and
               Nurminen, Mary and
               Kanojia, Diptesh and
               Moniz, Helena},
  booktitle = {Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)},
  month     = jun,
  year      = {2024},
  address   = {Sheffield, UK},
  publisher = {European Association for Machine Translation (EAMT)},
  url       = {https://aclanthology.org/2024.eamt-1.55/},
  pages     = {647--660},
  abstract  = {Machine translation (MT) research is most typically English-centric. In recent years, massively multilingual translation systems have also been increasingly popular. However, efforts purposefully focused on less-resourced languages are less widespread. In this paper, we focus on MT from and into the Estonian language. First, emphasizing the importance of data availability, we generate and publicly release a back-translation corpus of over 2 billion sentence pairs. Second, using these novel data, we create MT models covering 18 translation directions, all either from or into Estonian. We re-use the encoder of the NLLB multilingual model and train modular decoders separately for each language, surpassing the original NLLB quality. Our resulting MT models largely outperform other open-source MT systems, including previous Estonian-focused efforts, and are released as part of this submission.},
}
Markdown (Informal)
[Estonian-Centric Machine Translation: Data, Models, and Challenges](https://aclanthology.org/2024.eamt-1.55/) (Korotkova & Fishel, EAMT 2024)
ACL