% ABDUL paper, published in the LM4UC 2025 workshop proceedings (pages 16--21).
% The acronym in the title is brace-protected so that styles applying sentence
% casing keep it uppercase. The url field uses the canonical ACL Anthology
% permalink for this Anthology ID rather than a preview-build address.
@inproceedings{toughrai-etal-2025-abdul,
  title     = {{ABDUL}: A New Approach to Build Language Models for Dialects Using Formal Language Corpora Only},
  author    = {Toughrai, Yassine and
               Sma{\"i}li, Kamel and
               Langlois, David},
  editor    = {Nguyen, Duc},
  booktitle = {Proceedings of the 1st Workshop on Language Models for Underserved Communities (LM4UC 2025)},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.lm4uc-1.3/},
  pages     = {16--21},
  isbn      = {979-8-89176-242-8},
  abstract  = {Arabic dialects present major challenges for natural language processing (NLP) due to their diglossic nature, phonetic variability, and the scarcity of resources. To address this, we introduce a phoneme-like transcription approach that enables the training of robust language models for North African Dialects (NADs) using only formal language data, without the need for dialect-specific corpora. Our key insight is that Arabic dialects are highly phonetic, with NADs particularly influenced by European languages. This motivated us to develop a novel approach in which we convert Arabic script into a Latin-based representation, allowing our language model, ABDUL, to benefit from existing Latin-script corpora. Our method demonstrates strong performance in multi-label emotion classification and named entity recognition (NER) across various Arabic dialects. ABDUL achieves results comparable to or better than specialized and multilingual models such as DarijaBERT, DziriBERT, and mBERT. Notably, in the NER task, ABDUL outperforms mBERT by 5{\%} in F1-score for Modern Standard Arabic (MSA), Moroccan, and Algerian Arabic, despite using a vocabulary four times smaller than mBERT.},
}
Markdown (Informal)
[ABDUL: A New Approach to Build Language Models for Dialects Using Formal Language Corpora Only](https://preview.aclanthology.org/landing_page/2025.lm4uc-1.3/) (Toughrai et al., LM4UC 2025)
ACL