% Workshop paper (AbjadNLP 2026 proceedings) presenting the Alkhalil Corpus for Modern Standard Arabic.
% Case-sensitive words in title/booktitle are protected with whole-word braces so
% sentence-casing styles keep their capitals.
% NOTE(review): url targets a preview.aclanthology.org branch build
% ("manual-author-scripts"); presumably the permanent record lives under
% aclanthology.org -- confirm and replace once the volume is published.
@inproceedings{belayachi-mazroui-2026-alkhalil,
  title     = {Alkhalil Corpus: An Open-Source Thematic and Lemmatized Corpus for {Modern} {Standard} {Arabic}},
  author    = {Belayachi, Samir and
               Mazroui, Azzeddine},
  booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {Arabic} Script},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.abjadnlp-1.27/},
  pages     = {192--197},
  abstract  = {The availability of large annotated corpora remains a major challenge for the development of natural language processing systems for under-resourced languages such as Arabic. In this paper, we present two annotated corpora dedicated to Modern Standard Arabic. These corpora are open-source and freely available on the Hugging Face platform. The first corpus, annotated by theme and designed to provide a balanced representation of contemporary Arabic usage, comprises approximately 76 million words collected from diverse sources covering multiple domains and geographical regions. The second corpus, containing approximately one million words, is a sub-corpus extracted from the first. It was annotated with lemma tags using a semi-automatic approach that combines automatic annotation with the Alkhalil lemmatizer and MADAMIRA, followed by manual validation.},
}
Markdown (Informal)
[Alkhalil Corpus: An Open-Source Thematic and Lemmatized Corpus for Modern Standard Arabic](https://preview.aclanthology.org/manual-author-scripts/2026.abjadnlp-1.27/) (Belayachi & Mazroui, AbjadNLP 2026)
ACL