@comment{Workshop paper from WAT 2024. The url field holds the canonical ACL Anthology address for this paper rather than a temporary preview-branch host.}
@inproceedings{imamura-utiyama-2024-empirical,
  title     = {An Empirical Study of Multilingual Vocabulary for Neural Machine Translation Models},
  author    = {Imamura, Kenji and Utiyama, Masao},
  editor    = {Nakazawa, Toshiaki and Goto, Isao},
  booktitle = {Proceedings of the Eleventh Workshop on {Asian} Translation ({WAT} 2024)},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.wat-1.2/},
  doi       = {10.18653/v1/2024.wat-1.2},
  pages     = {22--35},
  abstract  = {In this paper, we discuss multilingual vocabulary for neural machine translation models. Multilingual vocabularies should generate highly accurate machine translations regardless of the languages, and have preferences so that tokenized strings contain rare out-of-vocabulary (OOV) tokens and token sequences are short. In this paper, we discuss the characteristics of various multilingual vocabularies via tokenization and translation experiments. We also present our recommended vocabulary and tokenizer.},
}
Markdown (Informal)
[An Empirical Study of Multilingual Vocabulary for Neural Machine Translation Models](https://aclanthology.org/2024.wat-1.2/) (Imamura & Utiyama, WAT 2024)
ACL