% Workshop paper (WACL-4, 2025) presenting the SADSLyC song-lyrics corpus.
% Braced words in title/booktitle are proper nouns and acronyms that must keep
% their capitalisation when a bibliography style applies sentence casing.
@inproceedings{alahmari-2025-sadslyc,
  title     = {{SADSLyC}: A Corpus for {Saudi} {Arabian} Multi-dialect Identification through Song Lyrics},
  author    = {Alahmari, Salwa Saad},
  editor    = {Ezzini, Saad and
               Alami, Hamza and
               Berrada, Ismail and
               Benlahbib, Abdessamad and
               El Mahdaouy, Abdelkader and
               Lamsiyah, Salima and
               Derrouz, Hatim and
               Haddad Haddad, Amal and
               Jarrar, Mustafa and
               El-Haj, Mo and
               Mitkov, Ruslan and
               Rayson, Paul},
  booktitle = {Proceedings of the 4th Workshop on {Arabic} Corpus Linguistics ({WACL-4})},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wacl-1.4/},
  pages     = {38--43},
  abstract  = {This paper presents the Saudi Arabian Dialects Song Lyrics Corpus (SADSLyC), the first dataset featuring song lyrics from the five major Saudi dialects: Najdi (Central Region), Hijazi (Western Region), Shamali (Northern Region), Janoubi (Southern Region), and Shargawi (Eastern Region). The dataset consists of 31,358 sentences, with each sentence representing a self-contained verse in a song, totaling 151,841 words. Additionally, we present a baseline experiment using the SaudiBERT model to classify the fine-grained dialects in the SADSLyC Corpus. The model achieved an overall accuracy of 73{\%} on the test dataset.},
}
Markdown (Informal)
[SADSLyC: A Corpus for Saudi Arabian Multi-dialect Identification through Song Lyrics](https://preview.aclanthology.org/add-emnlp-2024-awards/2025.wacl-1.4/) (Alahmari, WACL 2025)
ACL