% Olsen, Touileb and Velldal (2023), paper in the ArabicNLP 2023 proceedings.
% ACL Anthology ID: 2023.arabicnlp-1.30 (matches the DOI suffix below).
% The url field uses the canonical aclanthology.org landing page, not a
% temporary preview-build link.
@inproceedings{olsen-etal-2023-arabic,
  title     = {{Arabic} dialect identification: An in-depth error analysis on the {MADAR} parallel corpus},
  author    = {Olsen, Helene and
               Touileb, Samia and
               Velldal, Erik},
  editor    = {Sawaf, Hassan and
               El-Beltagy, Samhaa and
               Zaghouani, Wajdi and
               Magdy, Walid and
               Abdelali, Ahmed and
               Tomeh, Nadi and
               Abu Farha, Ibrahim and
               Habash, Nizar and
               Khalifa, Salam and
               Keleg, Amr and
               Haddad, Hatem and
               Zitouni, Imed and
               Mrini, Khalil and
               Almatham, Rawan},
  booktitle = {Proceedings of ArabicNLP 2023},
  month     = dec,
  year      = {2023},
  address   = {Singapore (Hybrid)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.arabicnlp-1.30/},
  doi       = {10.18653/v1/2023.arabicnlp-1.30},
  pages     = {370--384},
  abstract  = {This paper provides a systematic analysis and comparison of the performance of state-of-the-art models on the task of fine-grained Arabic dialect identification using the MADAR parallel corpus. We test approaches based on pre-trained transformer language models in addition to Naive Bayes models with a rich set of various features. Through a comprehensive data- and error analysis, we provide valuable insights into the strengths and weaknesses of both approaches. We discuss which dialects are more challenging to differentiate, and identify potential sources of errors. Our analysis reveals an important problem with identical sentences across dialect classes in the test set of the MADAR-26 corpus, which may confuse any classifier. We also show that none of the tested approaches captures the subtle distinctions between closely related dialects.},
}
Markdown (Informal)
[Arabic dialect identification: An in-depth error analysis on the MADAR parallel corpus](https://aclanthology.org/2023.arabicnlp-1.30/) (Olsen et al., ArabicNLP 2023)
ACL