@comment{
  AraSafe paper, Findings of EMNLP 2025 (ACL Anthology ID 2025.findings-emnlp.529).
  The url field uses the permanent aclanthology.org address rather than a
  preview-build link; the doi field identifies the same record.
}
@inproceedings{mubarak-etal-2025-arasafe,
  title     = {{AraSafe}: Benchmarking Safety in {Arabic} {LLMs}},
  author    = {Mubarak, Hamdy and
               Mohamed, Abubakr and
               Hawasly, Majd},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.529/},
  doi       = {10.18653/v1/2025.findings-emnlp.529},
  pages     = {9976--9992},
  isbn      = {979-8-89176-335-7},
  abstract  = {We introduce AraSafe, the first large-scale native Arabic safety benchmark for large language models (LLMs), addressing the pressing need for culturally and linguistically representative evaluation resources. The dataset comprises 12K naturally occurring, human-written Arabic prompts containing both harmful and non-harmful content across diverse domains, including linguistics, social studies, and science. Each prompt was independently annotated by two experts into one of nine fine-grained safety categories, including `Safe/Not Harmful', `Illegal Activities', `Violence or Harm', `Privacy Violation', and `Hate Speech'. Additionally, to support training classifiers for harmful content and due to the imbalanced representation of harmful content in the natural dataset, we create a synthetic dataset of additional 12K harmful prompts generated by GPT-4o via carefully designed prompt engineering techniques. We benchmark a number of Arabic-centric and multilingual models in the 7 to 13B parameter range, including Jais, AceGPT, Allam, Fanar, Llama-3, Gemma-2, and Qwen3, as well as BERT-based fine-tuned classifier models on detecting harmful prompts. GPT-4o was used as an upper-bound reference baseline. Our evaluation reveals critical safety blind spots in Arabic LLMs and underscores the necessity of localized, culturally grounded benchmarks for building responsible AI systems.},
}
Markdown (Informal)
[AraSafe: Benchmarking Safety in Arabic LLMs](https://aclanthology.org/2025.findings-emnlp.529/) (Mubarak et al., Findings 2025)
ACL
- Hamdy Mubarak, Abubakr Mohamed, and Majd Hawasly. 2025. AraSafe: Benchmarking Safety in Arabic LLMs. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 9976–9992, Suzhou, China. Association for Computational Linguistics.