% GEM 2026 workshop paper (ACL Anthology ID 2026.gem-main.54).
% The url field uses the permanent Anthology address, not an ingestion preview build.
@inproceedings{jamil-etal-2026-token,
  title     = {Token Cost Inequality: Measuring Tokenization Disparities Across Scripts in {Roman} {Urdu} and {Urdu}},
  author    = {Jamil, Waleed and
               Rafi, Saima and
               Yu, Yanchao},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.gem-main.54/},
  pages     = {563--573},
  isbn      = {979-8-89176-423-1},
  abstract  = {Tokenization is central to modern language models, yet its effects on cross-script efficiency, input cost, and truncation behavior remain underexplored. We study this issue through aligned comparisons of Urdu and Roman Urdu, asking whether semantically equivalent content incurs systematically different tokenization costs across scripts. We introduce Token Cost Inequality (TCI), a metric for quantifying relative tokenization efficiency under semantic alignment, and propose a multi-axis framework spanning token cost, fragmentation, and fixed-budget retention. Across three tokenizer families (cl100k, mT5, and ByT5), we find that tokenization disparities are strongly tokenizer-dependent, with substantial differences in token cost and segmentation behavior across scripts. We further identify an efficiency-retention paradox: token cost alone does not fully explain truncation behavior. Under fixed token budgets, Roman Urdu preserves more character-level content than native Urdu, reflecting differences in character-per-token density and fragmentation. Lightweight normalization yields minimal gains, suggesting that the observed disparities arise primarily from tokenizer design rather than superficial orthographic variation. These findings provide controlled evidence that fixed token budgets can produce unequal surface-coverage conditions across scripts, with implications for input-side cost estimation, benchmark design, and multilingual evaluation under constrained token budgets.}
}