% Workshop paper record (ACL Anthology ID 2026.abjadnlp-1.2).
% The url field uses the canonical Anthology landing page, not a temporary preview branch.
@inproceedings{k-h-de-nardi-2026-rethinking,
  title     = {Rethinking Polarity Detection: When {BPE} Fails Across Scripts},
  author    = {K H, Manodyna and
               De Nardi, Luc},
  editor    = {El-Haj, Mo and
               Rayson, Paul and
               Jarrar, Mustafa and
               Ezeani, Ignatius and
               Ezzini, Saad and
               Ahmadi, Sina and
               Haddad, Amal Haddad and
               Amol, Cynthia and
               Abdelali, Ahmad and
               Abudalfa, Shadi},
  booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {Arabic} Script},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.abjadnlp-1.2/},
  doi       = {10.18653/v1/2026.abjadnlp-1.2},
  pages     = {6--14},
  abstract  = {Multilingual evaluation often relies on language coverage or translated benchmarks, implicitly assuming that subword tokenization behaves comparably across scripts. In mixed-script settings, this assumption breaks down. We examine this effect using polarity detection as a case study, comparing Orthographic Syllable Pair Encoding (OSPE) and Byte Pair Encoding (BPE) under identical architectures, data, and training conditions on SemEval Task 9, which spans Devanagari, Perso-Arabic, and Latin scripts. OSPE is applied to Hindi, Nepali, Urdu, and Arabic, while BPE is retained for English. We find that BPE systematically underestimates performance in abugida and abjad scripts, producing fragmented representations, unstable optimization, and drops of up to 27 macro-F1 points for Nepali, while English remains largely unaffected. Script-aware segmentation preserves orthographic structure, stabilizes training, and improves cross-language comparability without additional data or model scaling, highlighting tokenization as a latent but consequential evaluation decision in multilingual benchmarks. While the analysis spans multiple scripts, we place particular emphasis on Arabic and Perso-Arabic languages, where frequency-driven tokenization most severely disrupts orthographic and morphological structure.},
}