@inproceedings{p-m-etal-2024-mtnlp,
title = "{MTNLP}-{IIITH}: Machine Translation for Low-Resource {I}ndic Languages",
author = "P M, Abhinav and
Shetye, Ketaki and
Krishnamurthy, Parameswari",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.wmt-1.65/",
doi = "10.18653/v1/2024.wmt-1.65",
pages = "751--755",
abstract = "Machine Translation for low-resource languages presents significant challenges, primarily due to limited data availability. We have a baseline model and a primary model. For the baseline model, we first fine-tune the mBART model (mbart-large-50-many-to-many-mmt) for the language pairs English-Khasi, Khasi-English, English-Manipuri, and Manipuri-English. We then augment the dataset by back-translating from Indic languages to English. To enhance data quality, we fine-tune the LaBSE model specifically for Khasi and Manipuri, generating sentence embeddings and applying a cosine similarity threshold of 0.84 to filter out low-quality back-translations. The filtered data is combined with the original training data and used to further fine-tune the mBART model, creating our primary model. The results show that the primary model slightly outperforms the baseline model, with the best performance achieved by the English-to-Khasi (en-kh) primary model, which recorded a BLEU score of 0.0492, a chrF score of 0.3316, and a METEOR score of 0.2589 (on a scale of 0 to 1), with similar results for other language pairs."
}
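
The filtering step described in the abstract (LaBSE sentence embeddings with a 0.84 cosine-similarity cutoff) can be illustrated with a minimal sketch. This is not the authors' code: it assumes the publicly available sentence-transformers LaBSE checkpoint, whereas the paper fine-tunes LaBSE for Khasi and Manipuri first, and the filter_back_translations helper is a hypothetical name introduced here for illustration.

# Illustrative sketch (not the authors' implementation): filter back-translated
# pairs by LaBSE embedding similarity, keeping pairs at or above the 0.84
# threshold reported in the abstract. Assumes the public LaBSE checkpoint,
# not the paper's Khasi/Manipuri fine-tuned model.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/LaBSE")

def filter_back_translations(src_sents, bt_sents, threshold=0.84):
    """Keep (source, back-translation) pairs whose sentence embeddings
    have cosine similarity >= threshold."""
    src_emb = model.encode(src_sents, convert_to_tensor=True)
    bt_emb = model.encode(bt_sents, convert_to_tensor=True)
    # Pairwise similarities of aligned sentences sit on the diagonal.
    sims = util.cos_sim(src_emb, bt_emb).diagonal()
    return [
        (s, t)
        for s, t, sim in zip(src_sents, bt_sents, sims.tolist())
        if sim >= threshold
    ]

The retained pairs would then be concatenated with the original parallel data for the second round of mBART fine-tuning, per the pipeline the abstract describes.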