% Conference paper in the NLP4DH 2025 proceedings (ACL Anthology ID 2025.nlp4dh-1.26).
% The url field uses the canonical aclanthology.org permalink, not a preview-branch mirror.
@inproceedings{mohamed-eida-habash-2025-beyond,
  title     = {Beyond {Cairo}: {Sa'idi} {Egyptian} {Arabic} Literary Corpus Construction and Analysis},
  author    = {Mohamed Eida, Mai and
               Habash, Nizar},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               {\"O}hman, Emily and
               Bizzoni, Yuri and
               Miyagawa, So and
               Alnajjar, Khalid},
  booktitle = {Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.nlp4dh-1.26/},
  pages     = {292--304},
  isbn      = {979-8-89176-234-3},
  abstract  = {Egyptian Arabic (EA) NLP resources have mainly focused on Cairene Egyptian Arabic (CEA), leaving sub-dialects like Sa'idi Egyptian Arabic (SEA) underrepresented. This paper introduces the first SEA corpus -- an open-source, 4-million-word literary dataset of a dialect spoken by {\textasciitilde}30 million Egyptians. To validate its representation, we analyze SEA-specific linguistic features from dialectal surveys, confirming a higher prevalence in our corpus compared to existing EA datasets. Our findings offer insights into SEA's orthographic representation in morphology, phonology, and lexicon, incorporating CODA* guidelines for normalization.},
}
Markdown (Informal)
[Beyond Cairo: Sa’idi Egyptian Arabic Literary Corpus Construction and Analysis](https://aclanthology.org/2025.nlp4dh-1.26/) (Mohamed Eida & Habash, NLP4DH 2025)
ACL