@article{cognetta-okazaki-2025-tokenization,
  title     = {Tokenization as Finite-State Transduction},
  author    = {Cognetta, Marco and Okazaki, Naoaki},
  journal   = {Computational Linguistics},
  volume    = {51},
  number    = {4},
  month     = dec,
  year      = {2025},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://preview.aclanthology.org/ingest-eacl/2025.cl-4.2/},
  doi       = {10.1162/coli.a.23},
  pages     = {1119--1149},
  abstract  = {Tokenization is the first step in modern neural language model pipelines where an input text is converted to a sequence of subword tokens. We introduce from first principles a finite-state transduction framework that can encode all possible tokenizations of a regular language. We then constructively show that Byte-Pair Encoding (BPE) and MaxMatch (WordPiece), two popular tokenization schemes, are also efficiently representable by simple finite-state transducers. For BPE, this is particularly surprising given that it does not tokenize strings from left to right and requires a notion of priority. We also discuss an application of subword-level pattern promotion to guided generation, where the outputs of a language model are constrained to match a specified pattern, and how tokenization-aware promotion offers a theoretical benefit to modeling.},
  internal-note = {url is an ACL Anthology preview/ingest link -- replace with the canonical anthology URL once published; DOI is authoritative},
}
Markdown (Informal)
@comment{Informal Markdown citation from the ACL Anthology export page:
  [Tokenization as Finite-State Transduction](https://preview.aclanthology.org/ingest-eacl/2025.cl-4.2/) (Cognetta & Okazaki, CL 2025) — ACL}