% TokCollate system-demonstration paper (ACL 2026, Volume 3: System Demonstrations).
% NOTE(review): the url field points at the Anthology preview/ingest host;
% confirm whether a permanent URL or DOI should replace it before relying on it.
@inproceedings{varis-etal-2026-tokcollate,
  author    = {Vari{\v{s}}, Du{\v{s}}an and
               Stephen, Abishek and
               Libovick{\'y}, Jind{\v{r}}ich},
  title     = {{TokCollate}: A Comprehensive Tool for Tokenizer Evaluation and Visualization across Languages},
  editor    = {Durrett, Greg and
               Jian, Ping},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 3: System Demonstrations)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {418--427},
  isbn      = {979-8-89176-392-0},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-demo.41/},
  abstract  = {Tokenization quality varies significantly across languages, contributing to disparities in LLM performance and cost for speakers of less-resourced languages {--} a phenomenon known as the ``token premium'' problem. Despite growing research interest, no existing tool provides a comprehensive intrinsic evaluation of tokenizers paired with interactive visualization. We present TokCollate (pronounced similarly to chocolate), a Python-based evaluation framework combined with a JavaScript visualization interface that addresses this gap. TokCollate implements a wide range of intrinsic metrics, including monolingual measures such as average token length and R{\'e}nyi/Shannon efficiency, and cross-lingual measures such as vocabulary overlap, Jensen-Shannon divergence, alignment-based Eflomal scores, and length ratios. It further enables analysis across language groups defined by genealogical families, scripts, geographic regions, speaker populations, and estimated data availability. TokCollate is open-source under the MIT license and available on GitHub.},
}

Markdown (Informal)
[TokCollate: A Comprehensive Tool for Tokenizer Evaluation and Visualization across Languages](https://preview.aclanthology.org/ingest-acl/2026.acl-demo.41/) (Variš et al., ACL 2026)
ACL