@inproceedings{velayuthan-sarveswaran-2025-egalitarian,
title = "Egalitarian Language Representation in Language Models: It All Begins with Tokenizers",
author = "Velayuthan, Menan and
Sarveswaran, Kengatharaiyer",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.400/",
pages = "5987--5996",
abstract = "Tokenizers act as a bridge between human language and the latent space of language models, influencing how language is represented in these models. Despite the dominance of English-Centric (EC) Large Language Models (LLMs), tokenization methods often fail to fairly represent complex scripts like Tamil, Sinhala, and Hindi, primarily due to pre-tokenization choices. This study demonstrates that pre-tokenization has a more significant impact than tokenization algorithms on achieving egalitarian representation. To address this, we introduce an improvement to the Byte Pair Encoding (BPE) algorithm by incorporating graphemes, which we term Grapheme Pair Encoding (GPE). Our experiments show that grapheme-based character extraction outperforms byte-level tokenizers for complex scripts. We validate this approach through experiments on Tamil, Sinhala, and Hindi. The codebase and resources used in this work are publicly available at https://github.com/vmenan/tokenizers-coling2025."
}
Markdown (Informal)
[Egalitarian Language Representation in Language Models: It All Begins with Tokenizers](https://aclanthology.org/2025.coling-main.400/) (Velayuthan & Sarveswaran, COLING 2025)
ACL