% Workshop paper from the SIGTURK 2026 proceedings (ACL Anthology ID 2026.sigturk-1.8).
% Proper nouns and acronyms are brace-protected as whole words so that styles
% applying sentence casing to titles keep their capitalisation.
% NOTE(review): url uses the canonical aclanthology.org permalink form instead of
% the branch-preview host the record was exported from -- confirm it resolves.
@inproceedings{samo-merlo-2026-modelling,
  title     = {Modelling the Morphology of Verbal Paradigms: A Case Study in the Tokenization of {Turkish} and {Hebrew}},
  author    = {Samo, Giuseppe and
               Merlo, Paola},
  editor    = {Oflazer, Kemal and
               K{\"o}ksal, Abdullatif and
               Varol, Onur},
  booktitle = {Proceedings of the Second Workshop Natural Language Processing for {Turkic} Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.sigturk-1.8/},
  pages     = {82--94},
  isbn      = {979-8-89176-370-8},
  abstract  = {In this paper, we investigate how transformer models represent complex verb paradigms in Turkish and Modern Hebrew, focusing on how tokenization strategies shape this ability. Using the Blackbird Language Matrices task on natural data, we show that for Turkish{---}with its transparent morphological markers{---}both monolingual and multilingual models succeed either when tokenization is highly atomic or breaking words into small subword units. For Hebrew, however, a multilingual model using character-level tokenization fails to capture its non-concatenative morphology, while a monolingual model with unified morpheme-aware segmentation excels. Performance improves on more synthetic datasets, in all models.},
}
Markdown (Informal)
[Modelling the Morphology of Verbal Paradigms: A Case Study in the Tokenization of Turkish and Hebrew](https://aclanthology.org/2026.sigturk-1.8/) (Samo & Merlo, SIGTURK 2026)
ACL