% COLING 2025 main-conference paper, ACL Anthology ID 2025.coling-main.348.
% The url field uses the canonical Anthology address rather than a preview/staging branch.
@inproceedings{lerner-yvon-2025-unlike,
  author    = {Lerner, Paul and
               Yvon, Fran{\c{c}}ois},
  title     = {Unlike ``Likely'', ``Unlike'' is Unlikely: {BPE}-based Segmentation hurts Morphological Derivations in {LLM}s},
  editor    = {Rambow, Owen and
               Wanner, Leo and
               Apidianaki, Marianna and
               Al-Khalifa, Hend and
               Di Eugenio, Barbara and
               Schockaert, Steven},
  booktitle = {Proceedings of the 31st International Conference on Computational Linguistics},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.coling-main.348/},
  pages     = {5181--5190},
  abstract  = {Large Language Models (LLMs) rely on subword vocabularies to process and generate text. However, because subwords are marked as initial- or intra-word, we find that LLMs perform poorly at handling some types of affixations, which hinders their ability to generate novel (unobserved) word forms. The largest models trained on enough data can mitigate this tendency because their initial- and intra-word embeddings are aligned; in-context learning also helps when all examples are selected in a consistent way; but only morphological segmentation can achieve a near-perfect accuracy.},
}
Markdown (Informal)
[Unlike “Likely”, “Unlike” is Unlikely: BPE-based Segmentation hurts Morphological Derivations in LLMs](https://aclanthology.org/2025.coling-main.348/) (Lerner & Yvon, COLING 2025)
ACL