@inproceedings{abouzeid-etal-2025-arabemonet,
title = "{A}rab{E}mo{N}et: A Lightweight Hybrid 2{D} {CNN}-{B}i{LSTM} Model with Attention for Robust {A}rabic Speech Emotion Recognition",
author = "Abouzeid, Ali and
Elbouardi, Bilal and
Maged, Mohamed and
Shehata, Shady",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.arabicnlp-main.17/",
doi = "10.18653/v1/2025.arabicnlp-main.17",
pages = "211--218",
ISBN = "979-8-89176-352-4",
abstract = "Speech emotion recognition is vital for human-computer interaction, particularly for low-resource languages like Arabic, which face challenges due to limited data and research. We introduce ArabEmoNet, a lightweight architecture designed to overcome these limitations and deliver state-of-the-art performance. Unlike previous systems relying on discrete MFCC features and 1D convolutions, which miss nuanced spectro-temporal patterns, ArabEmoNet uses Mel spectrograms processed through 2D convolutions, preserving critical emotional cues often lost in traditional methods. While recent models favor large-scale architectures with millions of parameters, ArabEmoNet achieves superior results with just 1 million parameters{---}90 times smaller than HuBERT base and 74 times smaller than Whisper. This efficiency makes it ideal for resource-constrained environments. ArabEmoNet advances Arabic speech emotion recognition, offering exceptional performance and accessibility for real-world applications."
}