% French partition of the OLDI Seed Corpus: WMT 2025 OLDI shared-task paper (ACL Anthology ID 2025.wmt-1.80).
% The url field uses the permanent Anthology address rather than the temporary preview/ingestion branch.
@inproceedings{marmonier-etal-2025-french,
  title     = {A {French} Version of the {OLDI} Seed Corpus},
  author    = {Marmonier, Malik and
               Sagot, Beno{\^i}t and
               Bawden, Rachel},
  editor    = {Haddow, Barry and
               Kocmi, Tom and
               Koehn, Philipp and
               Monz, Christof},
  booktitle = {Proceedings of the Tenth Conference on Machine Translation},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wmt-1.80/},
  pages     = {1048--1060},
  isbn      = {979-8-89176-341-8},
  abstract  = {We present the first French partition of the OLDI Seed Corpus, our submission to the WMT 2025 Open Language Data Initiative (OLDI) shared task. We detail its creation process, which involved using multiple machine translation systems and a custom-built interface for post-editing by qualified native speakers. We also highlight the unique translation challenges presented by the source data, which combines highly technical, encyclopedic terminology with the stylistic irregularities characteristic of user-generated content taken from Wikipedia. This French corpus is not an end in itself, but is intended as a crucial pivot resource to facilitate the collection of parallel corpora for the under-resourced regional languages of France.},
}
Markdown (Informal)
[A French Version of the OLDI Seed Corpus](https://preview.aclanthology.org/ingest-emnlp/2025.wmt-1.80/) (Marmonier et al., WMT 2025)
ACL
- Malik Marmonier, Benoît Sagot, and Rachel Bawden. 2025. A French Version of the OLDI Seed Corpus. In Proceedings of the Tenth Conference on Machine Translation, pages 1048–1060, Suzhou, China. Association for Computational Linguistics.