@inproceedings{singh-etal-2025-evaluating,
  title     = {Evaluating {IndicTrans2} and {ByT5} for {English}--{Santali} Machine Translation Using the {Ol Chiki} Script},
  author    = {Singh, Kshetrimayum Boynao and
               Ekbal, Asif and
               Pakray, Partha},
  editor    = {Shukla, Ankita and
               Kumar, Sandeep and
               Bedi, Amrit Singh and
               Chakraborty, Tanmoy},
  booktitle = {Proceedings of the 1st Workshop on Multimodal Models for Low-Resource Contexts and Social Impact (MMLoSo 2025)},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.mmloso-1.9/},
  pages     = {95--100},
  isbn      = {979-8-89176-311-1},
  abstract  = {In this study, we examine and evaluate two multilingual NMT models, IndicTrans2 and ByT5, for English-Santali bidirectional translation using the Ol Chiki script. The models are trained on the MMLoSo Shared Task dataset, supplemented with public English-Santali resources, and evaluated on the AI4Bharat IN22 and Flores test sets, specifically IN22-Gen and Flores200-dev. IndicTrans2 finetune strongly outperforms ByT5 across both directions. On IN22-Gen, it achieves 26.8 BLEU and 53.9 chrF++ for Santali{\textrightarrow}English and 7.3 BLEU and 40.3 chrF++ for English{\textrightarrow}Santali, compared to ByT5{'}s 5.6 BLEU and 30.2 chrF++ for Santali{\textrightarrow}English and 2.9 BLEU and 32.6 chrF++ for English{\textrightarrow}Santali. On the Flores test set, IndicTrans2 finetune achieves 22 BLEU, 49.2 chrF++, and 4.7 BLEU, 32.7 chrF++. Again, it surpasses ByT5. While ByT5{'}s bytelevel modelling is script-agnostic, it struggles with Santali morphology. IndicTrans2 benefits from multilingual pre-training and script unification.},
}

@comment{Leftover text from the ACL Anthology page export, preserved below.
BibTeX ignores text outside entries, but it is fenced off here so it cannot
be mistaken for entry data or left fused onto a closing brace.
Markdown (Informal)
[Evaluating IndicTrans2 and ByT5 for English-Santali Machine Translation Using the Ol Chiki Script](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.mmloso-1.9/) (Singh et al., MMLoSo 2025)
ACL}