% ACL Anthology record: Bangla/Bengali seed dataset submission to the WMT24
% Open Language Data Initiative shared task (Ninth Conference on Machine Translation).
% The url field holds the canonical aclanthology.org address, whose Anthology ID
% (2024.wmt-1.42) matches the DOI suffix. Braces in the title protect whole
% proper nouns and acronyms from style-driven sentence casing.
@inproceedings{ahmed-etal-2024-bangla,
    title = "The {Bangla}/{Bengali} Seed Dataset Submission to the {WMT24} {Open Language Data Initiative} Shared Task",
    author = "Ahmed, Firoz and
      Venkateswaran, Nitin and
      Moeller, Sarah",
    editor = "Haddow, Barry and
      Kocmi, Tom and
      Koehn, Philipp and
      Monz, Christof",
    booktitle = "Proceedings of the Ninth Conference on Machine Translation",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.wmt-1.42/",
    doi = "10.18653/v1/2024.wmt-1.42",
    pages = "556--566",
    abstract = "We contribute a seed dataset for the Bangla/Bengali language as part of the WMT24 Open Language Data Initiative shared task. We validate the quality of the dataset against a mined and automatically aligned dataset (NLLBv1) and two other existing datasets of crowdsourced manual translations. The validation is performed by investigating the performance of state-of-the-art translation models fine-tuned on the different datasets after controlling for training set size. Machine translation models fine-tuned on our dataset outperform models tuned on the other datasets in both translation directions (English-Bangla and Bangla-English). These results confirm the quality of our dataset. We hope our dataset will support machine translation for the Bangla/Bengali community and related low-resource languages.",
}
Markdown (Informal)
[The Bangla/Bengali Seed Dataset Submission to the WMT24 Open Language Data Initiative Shared Task](https://aclanthology.org/2024.wmt-1.42/) (Ahmed et al., WMT 2024)
ACL