@comment{
  Workshop paper from the MeLLM 2026 proceedings.
  NOTE(review): the url field below points at a preview/ingest host of the
  ACL Anthology; confirm the canonical URL before relying on this entry.
}
@inproceedings{chowdhury-woolf-2026-benchmarking,
  author    = {Chowdhury, Soham and
               Woolf, Warren},
  title     = {Benchmarking Byte-Pair Encoding Tokenizers on Different Languages with Bits per Byte},
  editor    = {Huang, Kaiyu and
               Mo, Fengran and
               Chen, Pinzhen and
               Jiang, Meng},
  booktitle = {Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({MeLLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {275--283},
  isbn      = {979-8-89176-430-9},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.mellm-1.27/},
  abstract  = {Tokenization significantly affects the cross-lingual performance of language models, yet recent tokenizer variants such as SuperBPE and MorphBPE have not been systematically evaluated across typologically diverse languages. We conduct the first extrinsic cross-language comparison of BPE, SuperBPE, and MorphBPE tokenizers on English, Mandarin, and Hungarian, using bits per byte (BPB) normalized perplexity as our metric, with vocabulary sizes of 8K, 16K, and 32K. We find that SuperBPE matches BPE for English but underperforms by 0.01{--}0.06 BPB for Hungarian and Mandarin, suggesting that cross-whitespace merging is counterproductive for non-English languages. MorphBPE performs worse than BPE across all settings, with gaps of 0.02{--}0.04 BPB at the 32K vocabulary size. These results suggest that linguistic theory alone does not guarantee practical improvements in tokenizer design, and that standard BPE remains a surprisingly effective baseline across typologically diverse languages.},
}
Markdown (Informal)
[Benchmarking Byte-Pair Encoding Tokenizers on Different Languages with Bits per Byte](https://preview.aclanthology.org/ingest-acl-workshops/2026.mellm-1.27/) (Chowdhury & Woolf, MeLLM 2026)
ACL