% EACL 2026 short paper (ACL Anthology ID 2026.eacl-short.22).
% The url field uses the canonical aclanthology.org landing page; the
% preview/ingest host it was exported from is a temporary staging site.
@inproceedings{lee-etal-2026-morpheme,
  title     = {Morpheme Matters: Morpheme-Based Subword Tokenization for {Korean} Language Models},
  author    = {Lee, DongHyeok and
               Park, Jeongyeon and
               Cho, Kyungbeen and
               Lee, Jae Sung},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 2: Short Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.eacl-short.22/},
  pages     = {297--306},
  isbn      = {979-8-89176-381-4},
  abstract  = {Tokenization plays a crucial role in the performance of language models. However, most existing tokenizers rely on frequency-based segmentation, which fails to capture the morphological structure of languages and often leads to inefficient token representations. In this study, we propose a novel tokenization method that emphasizes the importance of Korean morphological structures in eojeol (Korean spacing unit). This method is designed to accommodate both inter-eojeol segmentation and intra-eojeol segmentation, enabling the selection of subwords based on morphemes. We pretrained a language model using the proposed method and evaluated its performance on Korean benchmark tasks. Experimental results demonstrate that the proposed method generally outperforms existing approaches. Notably, it produces significantly fewer tokens per input sequence, indicating its effectiveness and efficiency for Korean language modeling. The code is available at https://github.com/Dohy-Lee/mob.},
}
Markdown (Informal)
[Morpheme Matters: Morpheme-Based Subword Tokenization for Korean Language Models](https://preview.aclanthology.org/ingest-eacl/2026.eacl-short.22/) (Lee et al., EACL 2026)
ACL