% Conference paper on the MaitH 1.0 Maithili-Hindi parallel corpus and baselines.
% Typed as inproceedings (not article): the record carries an editor list, a
% conference name, a venue address and a publisher, so the conference name
% belongs in booktitle rather than journal.
% The former volume value "main" appeared to be a track label, not a volume
% number; it has been dropped here and is still visible in the url path.
% NOTE(review): the url points at a preview/ingest host -- presumably it should
% be swapped for the canonical record URL once published; confirm before release.
% NOTE(review): publisher string kept as exported; verify the official name.
@inproceedings{dubey-etal-2026-maith,
  title     = {{MaitH} 1.0: A Parallel Corpus and Baseline for Low-Resource {Maithili}-{Hindi} Translation},
  author    = {Dubey, Kamanksha Prasad and
               Maurya, Chandresh and
               Padmanabh, Kumar},
  editor    = {Piperidis, Stelios and
               Bel, N{\'u}ria and
               van den Heuvel, Henk and
               Ide, Nancy and
               Krek, Simon and
               Toral, Antonio},
  booktitle = {International Conference on Language Resources and Evaluation},
  month     = may,
  year      = {2026},
  address   = {Palma de Mallorca, Spain},
  publisher = {ELRA Language Resource Association},
  url       = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.676/},
  pages     = {8567--8576},
  abstract  = {Maithili is one of the 22 official languages recognized in the Indian Constitution. The literature of Maithili is rich; however, due to current socio-political changes, the language is on the verge of extinction. Therefore, it is crucial to develop a corpus for low-resource Indic languages like Maithili to ensure that the dream of ``No Language Left Behind'' (NLLB) is realized. With this in mind, we contribute a corpus (1,05,600 sentences) containing both manually curated and synthetically generated. Additionally, we propose a strong baseline on the Maithali-Hindi pair using multilingual pretrained models such as IndicTrans2, mBART50, mT5, and NLLB-200 distilled. We evaluate the translation systems using standard performance metrics, including BLEU, CHRF2, TER, COMET, METEOR, and BERTScore. Comparative experiments conducted against the existing NLLB dataset (5,50,300 sentence pairs) demonstrate that our proposed dataset consistently yields superior translation quality. Finally, these results demonstrate that, even with a smaller corpus size, high-quality, task-specific data significantly enhance translation accuracy for low-resource Indian languages, such as Maithili.},
}