% Workshop paper in the TrustNLP 2026 proceedings (Anthology ID 2026.trustnlp-main.37).
% NOTE(review): editor surname corrected from "Galystan" to "Galstyan"; confirm
% against the proceedings front matter.
% NOTE(review): the url below points at an Anthology preview/ingest branch; swap in
% the canonical URL (and a doi, if one is assigned) once the volume is published.
@inproceedings{zaghouani-2026-toward,
  title     = {Toward Dialect-Aware Safety Evaluation for {Arabic} Large Language Models},
  author    = {Zaghouani, Wajdi},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galstyan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({TrustNLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.37/},
  pages     = {503--514},
  isbn      = {979-8-89176-418-7},
  abstract  = {Large language models (LLMs) are increasingly deployed with safety alignment mechanisms designed to prevent harmful outputs including hate speech, harassment, and unsafe instructions. However, existing safety evaluation frameworks remain heavily centered on English and standardized language varieties, creating a critical gap for languages characterized by extensive dialectal variation. Arabic provides a particularly important case: everyday communication across the Arab world occurs predominantly in regional dialects rather than Modern Standard Arabic (MSA), yet these dialects are systematically underrepresented in alignment training corpora and safety benchmarks. In this paper we introduce the Dialect Safety Gap, defined as systematic variation in LLM safety behavior across dialects of the same language. We argue that this phenomenon arises from the interaction between alignment training procedures and linguistic variation: safety alignment implicitly encodes normative patterns present in training datasets, and when dialectal forms diverge from those patterns, safety behavior degrades through lexical, morphological, and pragmatic mechanisms. We propose a formal framework grounded in algorithmic fairness that links dialect variation to alignment pipeline design, introduce both a binary DSG Score and a magnitude-aware Pairwise Dialect Inconsistency metric, and propose the Dialect-Aware Safety Evaluation Protocol (DASEP) as a practical evaluation framework. We demonstrate the feasibility of dialect-aware evaluation through a controlled, human-annotated prompt-probe experiment across five Arabic variety groups, revealing a structured gradient of safety degradation that correlates with linguistic distance from MSA.},
}
Markdown (Informal)
[Toward Dialect-Aware Safety Evaluation for Arabic Large Language Models](https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.37/) (Zaghouani, TrustNLP 2026)
ACL