@inproceedings{adlaon-marcos-2024-finding,
title = "Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting",
author = "Adlaon, Kristine Mae M. and
Marcos, Nelson",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-emnlp.860/",
doi = "10.18653/v1/2024.findings-emnlp.860",
pages = "14673--14682",
abstract = "This paper investigates the impact of different Byte Pair Encoding (BPE) configurations, specifically, merge operations on neural machine translation (NMT) performance for the Filipino-Cebuano language pair across various text domains. Results demonstrate that smaller BPE configurations, notably 2k, 5k, and 8k consistently yield higher BLEU scores, indicating improved translation quality through finer tokenization granularity. Conversely, larger BPE configurations and the absence of BPE result in lower BLEU scores, suggesting a decline in translation quality due to coarser tokenization. Additionally, these findings help us understand how the size of the model and how finely we break down words affect the quality of translations. This knowledge will be useful for improving translation systems, especially for languages that don`t have many parallel texts available for training."
}
Markdown (Informal)
[Finding the Optimal Byte-Pair Encoding Merge Operations for Neural Machine Translation in a Low-Resource Setting](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-emnlp.860/) (Adlaon & Marcos, Findings 2024)
ACL