@comment{Conference paper in the Findings of ACL 2026 volume. NOTE(review): the url field below points at the ACL Anthology preview (ingest) host; the final canonical address may differ -- TODO confirm before publishing this bibliography.}
@inproceedings{talafha-etal-2026-zero,
  title     = {Zero-Shot Context-Aware {ASR} for Diverse {Arabic} Varieties},
  author    = {Talafha, Bashar and
               Alhassan, Amin Abu and
               Abdul-Mageed, Muhammad},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1296/},
  pages     = {26029--26044},
  isbn      = {979-8-89176-395-1},
  abstract  = {Zero-shot ASR for Arabic remains challenging: while multilingual models perform well on Modern Standard Arabic (MSA), error rates rise sharply on dialectal and accented speech due to linguistic mismatch and scarce labeled data. We study \textit{context-aware} decoding as a lightweight test-time adaptation paradigm that conditions inference on external side information without parameter updates. For promptable encoder{--}decoder ASR (e.g., Whisper), we incorporate context through (i) decoder prompting with first-pass hypotheses and (ii) encoder/decoder prefixing with retrieved speech-text exemplars, complemented by simple prompt reordering and optional speaker-matched synthetic exemplars to improve robustness in informal and multi-speaker settings. To extend contextual adaptation beyond promptable architectures, we introduce \textit{proxy-guided $n$-best selection} for CTC ASR: given one or more external proxy hypotheses, we select from a model{'}s $n$-best list by minimizing text-level distance to the proxies, enabling contextual inference without direct prompting. Across ten Arabic conditions spanning MSA, accented MSA, and multiple dialects, the best-performing context-aware variants yield average relative WER reductions of 22.29{\%} on MSA, 20.54{\%} on accented MSA, and 9.15{\%} on dialectal Arabic. For CTC ASR on our Common Voice MSA testbed, proxy-guided selection reduces WER by 15.6{\%} relative and recovers a substantial fraction of oracle $n$-best gains, showing that external-context guidance can also benefit non-promptable ASR.},
}
Markdown (Informal)
[Zero-Shot Context-Aware ASR for Diverse Arabic Varieties](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1296/) (Talafha et al., Findings 2026)
ACL
- Bashar Talafha, Amin Abu Alhassan, and Muhammad Abdul-Mageed. 2026. Zero-Shot Context-Aware ASR for Diverse Arabic Varieties. In Findings of the Association for Computational Linguistics: ACL 2026, pages 26029–26044, San Diego, California, United States. Association for Computational Linguistics.