@inproceedings{noureldien-etal-2025-zero,
title = "Zero-Shot and Fine-Tuned Evaluation of Generative {LLM}s for {A}rabic Word Sense Disambiguation",
author = "Noureldien, Yossra and
Mohamed, Abdelrazig and
Attallah, Farah",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.arabicnlp-main.24/",
doi = "10.18653/v1/2025.arabicnlp-main.24",
pages = "298--305",
ISBN = "979-8-89176-352-4",
abstract = "Arabic presents unique challenges for sense level language understanding due to its rich morphology and semantic ambiguity. This paper benchmarks large generative language models (LLMs) for Arabic Word Sense Disambiguation (WSD) under both zero-shot and fine-tuning conditions. We evaluate one proprietary model (GPT-4o) and three opensource models (LLaMA 3.1-8B, Qwen 2.5-7B, and Gemma 2-9B) on two publicly available datasets. In zero-shot settings, GPT-4o achieved the highest overall performance, with comparable results across both datasets, reaching 79{\%} accuracy and an average macro-F1 score of 66.08{\%}. Fine-tuning, however, notably elevated all open models beyond GPT4o{'}s zero-shot results. Qwen achieved the top scores on one dataset, with an accuracy of 90.77{\%} and a macro-F1 score of 83.98{\%}, while LLaMA scored highest on the other, reaching an accuracy of 88.51{\%} and a macroF1 score of 69.41{\%}. These findings demonstrate that parameter-efficient supervised adaptation can close much of the performance gap and establish strong, reproducible baselines for Arabic WSD using open-source, relatively medium-sized models. Full code is publicly available."
}