% ACL 2026 long paper by Liao and Shi on how tokenization affects phonological knowledge in language models.
@inproceedings{liao-shi-2026-tokenization,
  title     = {How Tokenization Limits Phonological Knowledge Representation in Language Models and How to Improve Them},
  author    = {Liao, Disen and Shi, Freda},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.634/},
  pages     = {13921--13938},
  isbn      = {979-8-89176-390-6},
  abstract  = {Tokenization is the first step in every language model (LM), yet it never takes the sounds of words into account. We investigate how tokenization influences text-only LMs' ability to represent phonological knowledge. Through a series of probing experiments, we show that subword-based tokenization systematically weakens the encoding of both local (e.g., rhyme) and global (e.g., syllabification) phonological features. To quantify this effect, we introduce the syllabification-tokenization alignment distance (STAD), a metric that measures the misalignment between a model{'}s tokenization and the natural syllable boundaries of words, and find that higher misalignment correlates with poorer phonological representations, providing a simple diagnostic for phonology-aware tokenization. To address these limitations, we propose a lightweight IPA-based fine-tuning method that infuses phonological awareness into LMs, leading to consistent improvements across three phonology-related tasks while largely preserving math and general reasoning ability, with 1.1{\%} and 0.9{\%} drops on GSM8K and MMLU, respectively.[Our code is available at {\ensuremath{<}}https://github.com/liaodisen/Tokenization-Phonology{\ensuremath{>}}]},
}
Markdown (Informal)
[How Tokenization Limits Phonological Knowledge Representation in Language Models and How to Improve Them](https://preview.aclanthology.org/ingest-acl/2026.acl-long.634/) (Liao & Shi, ACL 2026)
ACL