% Hatch & Richardson (2025): Semitic Root Encoding (SRE) tokenization for NMT.
% Paper in the Proceedings of The Third Arabic Natural Language Processing
% Conference; ACL Anthology ID 2025.arabicnlp-main.3.
@inproceedings{hatch-richardson-2025-semitic,
  title     = {{Semitic} Root Encoding: Tokenization Based on the Templatic Morphology of {Semitic} Languages in {NMT}},
  author    = {Hatch, Brendan T. and
               Richardson, Stephen D.},
  editor    = {Darwish, Kareem and
               Ali, Ahmed and
               Abu Farha, Ibrahim and
               Touileb, Samia and
               Zitouni, Imed and
               Abdelali, Ahmed and
               Al-Ghamdi, Sharefah and
               Alkhereyf, Sakhar and
               Zaghouani, Wajdi and
               Khalifa, Salam and
               AlKhamissi, Badr and
               Almatham, Rawan and
               Hamed, Injy and
               Alyafeai, Zaid and
               Alowisheq, Areeb and
               Inoue, Go and
               Mrini, Khalil and
               Alshammari, Waad},
  booktitle = {Proceedings of The Third Arabic Natural Language Processing Conference},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.arabicnlp-main.3/},
  pages     = {26--41},
  isbn      = {979-8-89176-352-4},
  abstract  = {The morphological structure of Semitic languages, such as Arabic, is based on non-concatenative roots and templates. This complex word structure used by humans is obscured to neural models that employ traditional tokenization algorithms, such as byte-pair encoding (BPE) (Sennrich et al., 2016; Gage, 1994). In this work, we present and evaluate Semitic Root Encoding (SRE), a tokenization method that represents both concatenative and non-concatenative structures in Semitic words with sequences of root, template stem, and BPE tokens. We apply the method to neural machine translation (NMT) and find that SRE tokenization yields an average increase of 1.15 BLEU over the baseline. SRE tokenization is also robust against generating combinations of roots with template stems that do not occur in nature. Finally, we compare the performance of SRE to tokenization based on non-linguistic root and template structures and tokenization based on stems, providing evidence that NMT models are capable of leveraging tokens based on non-concatenative Semitic morphology.},
}