@comment{
  Workshop paper (AbjadNLP 2026) taken from an ACL Anthology export.
  NOTE(review): the url field points at a preview.aclanthology.org build
  (manual-author-scripts); presumably it should be replaced by the canonical
  aclanthology.org link once that is confirmed to resolve.
  NOTE(review): address holds "Rabat, Morocco", presumably the workshop venue
  rather than the publisher's city; confirm before relocating it.
}
@inproceedings{jon-etal-2026-current,
  title     = {Current state of {LLMs} for {Arabic} dialectal machine translation},
  author    = {Jon, Josef and
               Bondok, Rawan and
               Bojar, Ond{\v{r}}ej},
  booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {Arabic} Script},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.abjadnlp-1.41/},
  pages     = {329--363},
  abstract  = {This work presents an evaluation of large language models (LLMs) for English to dialectal Arabic machine translation on the MADAR dataset. We evaluate both translation directions (English to Arabic and vice-versa) on 16 Arabic dialects. Our experiments cover a diverse set of models, including specialized Arabic models (Jais, Nile), multilingual models (Gemma, Command-R, Mistral, Aya), and commercial APIs (GPT-4.1). We employ multiple evaluation metrics: BLEU, CHRF, COMET (both reference-based and reference-less variants) and GEMBA (LLM-as-a-judge), as well as a small-scale manual evaluation, to assess translation quality. We discuss the challenges of automatic MT evaluation, especially in the context of Arabic dialects. We also evaluate the ability of LLMs to classify the dialect used in a text. The study offers insights into the capabilities and limitations of current LLMs for dialectal Arabic machine translation, particularly highlighting the difficulty of handling dialectal diversity, although the results may be influenced by possible training data contamination, which is always a concern with LLMs.},
}
Markdown (Informal)
[Current state of LLMs for Arabic dialectal machine translation](https://preview.aclanthology.org/manual-author-scripts/2026.abjadnlp-1.41/) (Jon et al., AbjadNLP 2026)
ACL