% Workshop paper from WACL-4 (ACL Anthology ID 2025.wacl-1.7).
% The url field uses the canonical Anthology landing page instead of a temporary preview-branch build.
% Proper nouns in the title are brace-protected as whole words so sentence-casing styles keep their capitals.
@inproceedings{el-ghawi-2025-web,
  title     = {Web-Based Corpus Compilation of the {Emirati} {Arabic} Dialect},
  author    = {El-Ghawi, Yousra A.},
  editor    = {Ezzini, Saad and
               Alami, Hamza and
               Berrada, Ismail and
               Benlahbib, Abdessamad and
               El Mahdaouy, Abdelkader and
               Lamsiyah, Salima and
               Derrouz, Hatim and
               Haddad Haddad, Amal and
               Jarrar, Mustafa and
               El-Haj, Mo and
               Mitkov, Ruslan and
               Rayson, Paul},
  booktitle = {Proceedings of the 4th Workshop on {Arabic} Corpus Linguistics ({WACL-4})},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wacl-1.7/},
  pages     = {63--67},
  abstract  = {This paper displays some initial efforts conducted in the compilation pursuits of Arabic dialectal corpora in the form of raw text, the end purpose of which is to fine-tune existing Arabic large language models (LLM) to better understand and generate text in the Emirati dialect as instructed. The focus of the paper is on the process of compiling corpora from the web, which includes the exploration of possible methods, tools and techniques specific to web search, as well as examples of genres and domains to explore. The results of these efforts and the importance of native speaker contributions to corpus compilation for low-resource languages are also touched upon.},
}
Markdown (Informal)
[Web-Based Corpus Compilation of the Emirati Arabic Dialect](https://preview.aclanthology.org/jlcl-multiple-ingestion/2025.wacl-1.7/) (El-Ghawi, WACL 2025)
ACL