@inproceedings{thawani-etal-2023-learn,
title = "Learn Your Tokens: Word-Pooled Tokenization for Language Modeling",
author = "Thawani, Avijit and
Ghanekar, Saurabh and
Zhu, Xiaoyuan and
Pujara, Jay",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.findings-emnlp.662/",
doi = "10.18653/v1/2023.findings-emnlp.662",
pages = "9883--9893",
abstract = "Language models typically tokenize text into subwords, using a deterministic, hand-engineered heuristic of combining characters into longer surface-level strings such as `ing' or whole words. Recent literature has repeatedly shown the limitations of such a tokenization strategy, particularly for documents not written in English and for representing numbers. On the other extreme, byte/character-level language models are much less restricted but suffer from increased sequence description lengths and a subsequent quadratic expansion in self-attention computation. Recent attempts to compress and limit these context lengths with fixed size convolutions is helpful but completely ignores the word boundary. This paper considers an alternative `learn your tokens' scheme which utilizes the word boundary to pool bytes/characters into word representations, which are fed to the primary language model, before again decoding individual characters/bytes per word in parallel. We find that our moderately expressive and moderately fast end-to-end tokenizer outperform by over `300{\%}{`} both subwords and byte/character models over the intrinsic language modeling metric of next-word prediction across datasets. It particularly outshines on rare words, outperforming by a factor of 30! We extensively study the language modeling setup for all three categories of tokenizers and theoretically analyze how our end-to-end models can also be a strong trade-off in efficiency and robustness."
}
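
The abstract describes pooling byte/character embeddings into one vector per word using word boundaries before passing them to the word-level language model. The following is a minimal illustrative sketch of that pooling step only, not the paper's actual implementation: the class name, mean pooling, and all dimensions are my own assumptions.

# Hypothetical sketch of word-boundary pooling over character embeddings.
# Names and the choice of mean pooling are assumptions for illustration,
# not taken from the paper.
import torch
import torch.nn as nn

class WordPooledEncoder(nn.Module):
    """Pools character embeddings into one vector per word."""

    def __init__(self, vocab_size=256, char_dim=64, word_dim=256):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, char_dim)
        self.proj = nn.Linear(char_dim, word_dim)

    def forward(self, char_ids, word_ids):
        # char_ids: (seq_len,) byte/character ids
        # word_ids: (seq_len,) index of the word each character belongs to
        chars = self.char_emb(char_ids)                       # (seq_len, char_dim)
        n_words = int(word_ids.max().item()) + 1
        pooled = torch.zeros(n_words, chars.size(-1))
        pooled.index_add_(0, word_ids, chars)                 # sum characters per word
        counts = torch.bincount(word_ids, minlength=n_words).clamp(min=1)
        pooled = pooled / counts.unsqueeze(-1)                # mean pool within each word
        return self.proj(pooled)                              # (n_words, word_dim)

# Toy usage: "hi there" becomes two word vectors that a word-level
# language model backbone could then consume.
text = "hi there"
char_ids = torch.tensor([ord(c) for c in text])
word_ids = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1])             # whitespace attached to first word
word_vectors = WordPooledEncoder()(char_ids, word_ids)
print(word_vectors.shape)                                     # torch.Size([2, 256])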