% Conference paper in the Findings of ACL: EACL 2026 volume.
% The url uses the permanent ACL Anthology form (aclanthology.org/<anthology-id>/)
% instead of the temporary preview/ingest staging host.
% NOTE(review): address appears to hold the conference location rather than the
% publisher's city; kept as exported -- confirm against the target style.
@inproceedings{inoue-etal-2026-diacritics,
  title     = {Do Diacritics Matter? Evaluating the Impact of {Arabic} Diacritics on Tokenization and {LLM} Benchmarks},
  author    = {Inoue, Go and
               Alhafni, Bashar and
               Habash, Nizar and
               Baldwin, Timothy},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-eacl.22/},
  pages     = {426--442},
  isbn      = {979-8-89176-386-9},
  abstract  = {Diacritics are orthographic marks added to letters to specify pronunciation, disambiguate lexical meanings, or indicate grammatical distinctions. Diacritics can significantly influence language processing tasks, especially in languages like Arabic, where diacritic usage varies widely across domains and contexts. While diacritics provide valuable linguistic information, their presence can increase subword fragmentation during tokenization, potentially degrading the performance of NLP models. In this paper, we systematically analyze the impact of diacritics on tokenization and benchmark task performance across major Large Language Models (LLMs). Our results demonstrate that while modern LLMs show robustness to the limited diacritics naturally found in texts, full diacritization leads to substantially increased token fragmentation and degraded performance, highlighting the need for careful handling of diacritics in the future development of Arabic LLMs.},
}
Markdown (Informal)
[Do Diacritics Matter? Evaluating the Impact of Arabic Diacritics on Tokenization and LLM Benchmarks](https://aclanthology.org/2026.findings-eacl.22/) (Inoue et al., Findings 2026)
ACL