% Conference paper from the ACL 2026 long-papers volume (Dwivedi and Gopalan).
% Non-ASCII letters are written as LaTeX accent groups ({\'e}, {\=a}) so the
% record stays pure ASCII; proper nouns are brace-protected as whole words.
% NOTE(review): url points at the preview.aclanthology.org "ingest-acl" host,
%   presumably a staging link -- confirm the canonical URL before release.
% NOTE(review): address looks like the conference venue rather than the
%   publisher's city -- confirm this is what the target style expects.
@inproceedings{dwivedi-gopalan-2026-comparative,
  title     = {Comparative Analysis of the Intrinsic Metrics for Tokenizers and their effect on Downstream Tasks for {Hindi} and {Marathi}},
  author    = {Dwivedi, Shagun and
               Gopalan, Kaushik},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1037/},
  pages     = {22652--22663},
  isbn      = {979-8-89176-390-6},
  abstract  = {Various studies have pointed out that the performance of language models is poor in non-English or non-European languages. One of the factors affecting this performance is the effectiveness and suitability of the tokenization scheme used in the model. Indic scripts require multiple Unicode codepoints to represent a single visual unit to be encoded in the standard UTF-8 scheme. This paper investigates the effect of multiple tokenizers that use UTF-8 text input on the downstream performance of pretrained language models for Hindi and Marathi, languages written in \textit{Devan{\=a}gari} script. We present the intrinsic performance of the tokenizers using Fertility, R{\'e}nyi Efficiency and Percentile Frequency, and report the extrinsic performance of monolingual and multilingual models on question-answering tasks, using an automated parts-of-speech and sentence similarity based evaluation framework, and on word-level tasks such as grapheme-to-phoneme conversion and transliteration. We propose a grapheme cluster tokenizer for the script which shows performance better than or competitive with other popular tokenizers. We also find that the R{\'e}nyi Efficiency metric is highly correlated to downstream performance on question answering.},
}
Markdown (Informal)
[Comparative Analysis of the Intrinsic Metrics for Tokenizers and their effect on Downstream Tasks for Hindi and Marathi](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1037/) (Dwivedi & Gopalan, ACL 2026)
ACL