@comment{Record for a Findings of EMNLP 2025 paper hosted on the ACL Anthology.
The url field uses the canonical aclanthology.org address built from the same
paper ID that appears in the doi field, rather than a preview-branch mirror.}
@inproceedings{akra-etal-2025-active,
  title     = {Active Learning for Multidialectal {Arabic} {POS} Tagging},
  author    = {Akra, Diyam and
               Khalilia, Mohammed and
               Jarrar, Mustafa},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1359/},
  doi       = {10.18653/v1/2025.findings-emnlp.1359},
  pages     = {24960--24973},
  isbn      = {979-8-89176-335-7},
  abstract  = {Multidialectal Arabic POS tagging is challenging due to the morphological richness and high variability among dialects. While POS tagging for MSA has advanced thanks to the availability of annotated datasets, creating similar resources for dialects remains costly and labor-intensive. Increasing the size of annotated datasets does not necessarily result in better performance. Active learning offers a more efficient alternative by prioritizing annotating the most informative samples. This paper proposes an active learning approach for multidialectal Arabic POS tagging. Our experiments revealed that annotating approximately 15,000 tokens is sufficient for high performance. We further demonstrate that using a fine-tuned model from one dialect to guide the selection of initial samples from another dialect accelerates convergence{---}reducing the annotation requirement by about 2,000 tokens. In conclusion, we propose an active learning pipeline and demonstrate that, upon reaching its defined stopping point of 16,000 annotated tokens, it achieves an accuracy of 97.6{\%} on the Emirati Corpus.},
}
Markdown (Informal)
[Active Learning for Multidialectal Arabic POS Tagging](https://aclanthology.org/2025.findings-emnlp.1359/) (Akra et al., Findings 2025)
ACL