@comment{
  Workshop paper by Lanz and Pecina in the BioNLP 2026 proceedings.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  host; confirm and switch to the canonical Anthology URL (or add a doi)
  once the volume is published.
  NOTE(review): address appears to hold the event location rather than a
  publisher city; kept as exported, confirm against the target style.
}
@inproceedings{lanz-pecina-2026-tokenization,
  title     = {Tokenization Granularity and Medical Term Representations in Language Models},
  author    = {Lanz, Vojtech and
               Pecina, Pavel},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.45/},
  pages     = {559--571},
  isbn      = {979-8-89176-434-7},
  abstract  = {We investigate how tokenization granularity affects the representation of medical terminology in language models. Prior work links tokenization granularity to downstream performance under contextualized settings for specifically pretrained and fine-tuned models. We instead ask whether this relationship already emerges at the level of isolated term representations across existing pretrained models. We introduce an intrinsic definition retrieval task using UMLS term-definition pairs, with comparison to WordNet. We show that despite substantially heavier fragmentation of medical terminology, the models remain relatively robust in maintaining semantic alignment between medical terms and their definitions. At the same time, tokenization granularity still correlates with retrieval performance, indicating that effects previously observed in downstream biomedical tasks are already reflected at the level of isolated term representations. Encoder models benefit primarily from whole-token preservation, while for decoder LLMs, tokenization effects emerge mainly at deeper retrieval ranks.},
}
Markdown (Informal)
[Tokenization Granularity and Medical Term Representations in Language Models](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.45/) (Lanz & Pecina, BioNLP 2026)
ACL