@inproceedings{jeon-2025-beyond,
title = "Beyond Distribution: Investigating Language Models' Understanding of {Sino-Korean} Morphemes",
author = "Jeon, Taehee",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.569/",
doi = "10.18653/v1/2025.findings-emnlp.569",
pages = "10762--10772",
isbn = "979-8-89176-335-7",
abstract = "We investigate whether Transformer-based language models, trained solely on Hangul text, can learn the compositional morphology of Sino-Korean (SK) morphemes, which are fundamental to Korean vocabulary. Using BERT{\_}BASE and fastText, we conduct controlled experiments with target words and their ``real'' vs. ``fake'' neighbors{---}pairs that share a Hangul syllable representing the same SK morpheme vs. those that share only the Hangul syllable. Our results show that while both models{---}especially BERT{---}distinguish real and fake pairs to some extent, their performance is primarily driven by the frequency of each experimental word rather than a true understanding of SK morphemes. These findings highlight the limits of distributional learning for morpheme-level understanding and emphasize the need for explicit morphological modeling or Hanja-aware strategies to improve semantic representation in Korean language models. Our dataset and analysis code are available at: https://github.com/taeheejeon22/ko-skmorph-lm."
}

Markdown (Informal)
[Beyond Distribution: Investigating Language Models’ Understanding of Sino-Korean Morphemes](https://aclanthology.org/2025.findings-emnlp.569/) (Jeon, Findings 2025)
ACL