% Conference-paper record: Yadav and Shrivastava, Proceedings of the Tenth
% Conference on Machine Translation (2025), pages 1253--1258.
% Inner braces in the title protect whole words (acronyms, proper nouns) whose
% capitalisation must survive a style's sentence-casing.
% NOTE(review): the url field points at the preview.aclanthology.org ingest
% host, which looks like a staging address -- confirm and replace it with the
% canonical Anthology URL once that is verified.
@inproceedings{yadav-shrivastava-2025-preliminary,
  title     = {A Preliminary Exploration of Phrase-Based {SMT} and Multi-{BPE} Segmentations through Concatenated Tokenised Corpora for Low-Resource {Indian} Languages},
  author    = {Yadav, Saumitra and
               Shrivastava, Manish},
  editor    = {Haddow, Barry and
               Kocmi, Tom and
               Koehn, Philipp and
               Monz, Christof},
  booktitle = {Proceedings of the Tenth Conference on Machine Translation},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.wmt-1.103/},
  pages     = {1253--1258},
  isbn      = {979-8-89176-341-8},
  abstract  = {This paper describes our methodology and findings in building Machine Translation (MT) systems for submission to the WMT 2025 Shared Task on Low-Resource Indic Language Translation. Our primary aim was to evaluate the effectiveness of a phrase-based Statistical Machine Translation (SMT) system combined with a less common subword segmentation strategy for languages with very limited parallel data. We applied multiple Byte Pair Encoding (BPE) merge operations to the parallel corpora and concatenated the outputs to improve vocabulary coverage. We built systems for the English{--}Nyishi, English{--}Khasi, and English{--}Assamese language pairs. Although the approach showed potential as a data augmentation method, its performance in BLEU scores was not competitive with other shared task systems. This paper outlines our system architecture, data processing pipeline, and evaluation results, and provides an analysis of the challenges, positioning our work as an exploratory benchmark for future research in this area.},
}
Markdown (Informal)
[A Preliminary Exploration of Phrase-Based SMT and Multi-BPE Segmentations through Concatenated Tokenised Corpora for Low-Resource Indian Languages](https://preview.aclanthology.org/ingest-emnlp/2025.wmt-1.103/) (Yadav & Shrivastava, WMT 2025)
ACL