% Conference paper (CoNLL 2026) by Nguyen and Arehalli; field values kept exactly as exported.
% NOTE(review): the url field points at a preview.aclanthology.org ingest build, not a
% canonical address -- confirm and replace with the permanent URL (or add a doi) once available.
@inproceedings{nguyen-arehalli-2026-word,
  title     = {Word predictability estimates from language models are not robust to tokenizer vocabulary},
  author    = {Nguyen, Kien and Arehalli, Suhas},
  editor    = {Bonial, Claire and Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.3/},
  pages     = {34--44},
  isbn      = {979-8-89176-410-1},
  abstract  = {Much recent work has been interested in modeling language processing using measures of predictability estimated from pretrained language models. These models, however, are primarily built as language technologies rather than cognitive models, and make many design choices that may align poorly with theories of human language processing. We investigate one such choice {---} the size of the vocabulary learned by a BPE tokenizer {---} and investigate (1) its effect on the linguistic plausibility of subword units the model learns, (2) whether vocabulary size has a substantial influence on the surprisal estimates a model generates, and (3) whether those differences in surprisal translate to differences in the quality of downstream reading time predictions. We find that while vocabulary size doesn{'}t substantially affect the rate of morphologically reasonable tokenizations, it does have an impact on surprisal estimates and reading time predictions from 5-gram, LSTM, and GPT-2 language models. Moreover, we find that these differences primarily affect words that are split by the tokenizer, suggesting that psycholinguists should take care to design stimuli meant for computational modeling with subword tokenization in mind.},
}
Markdown (Informal)
[Word predictability estimates from language models are not robust to tokenizer vocabulary](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.3/) (Nguyen & Arehalli, CoNLL 2026)
ACL