@inproceedings{aboukozzana-etal-2025-amcrawl,
title = "{AMC}rawl: An {A}rabic Web-Scale Dataset of Interleaved Image-Text Documents and Image-Text Pairs",
author = "Aboukozzana, Shahad and
Khan, Muhammad Kamran J and
Ali, Ahmed",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.arabicnlp-main.37/",
pages = "448--465",
ISBN = "979-8-89176-352-4",
    abstract = "In this paper, we present the Arabic Multimodal Crawl (AMCrawl), to our knowledge the first natively sourced Arabic multimodal dataset, derived from the Common Crawl corpus and rigorously filtered for quality and safety. Image-text pair datasets are the standard choice for pretraining multimodal large language models. However, they are often derived from image alt-text metadata, which is typically brief and context-poor, disconnecting images from their broader meaning. Although significant advances have been made in building interleaved image-text datasets for English, such as the OBELICS dataset, a substantial gap remains for native Arabic content. Our processing covered 8.6 million Arabic web pages, yielding 5.8 million associated images and 1.3 billion text tokens. The final dataset includes interleaved image-text documents and question-answer pairs, featuring 2.8 million high-quality interleaved documents and 5 million QA pairs. Alongside the dataset, we release the complete pipeline and code, ensuring reproducibility and encouraging further research and development. To demonstrate the effectiveness of AMCrawl, we introduce a publicly available native Arabic vision-language model with 13 billion parameters. This model achieves competitive results when benchmarked against publicly available datasets. AMCrawl bridges a critical gap in Arabic multimodal resources, providing a robust foundation for developing Arabic multimodal large language models and fostering advancements in this underrepresented area. Code: github.com/shahad-aboukozzana/AMCrawl"
}