@comment{Review note: this is an LREC conference paper, so the entry type was
  corrected from @article (with the conference name wrongly placed in the
  journal field) to @inproceedings with a booktitle field. All other field
  values are kept as exported; the URL is an Anthology preview link and may
  change once the proceedings are finalised.}
@inproceedings{thmer-etal-2026-beyond,
  title     = "Beyond Literal Meaning: How {LLM}s Interpret Yemeni Proverbs",
  author    = "Thmer, Nasser and
               Al-Laith, Ali and
               Shoaib, Muhammad",
  editor    = "Piperidis, Stelios and
               Bel, N{\'u}ria and
               van den Heuvel, Henk and
               Ide, Nancy and
               Krek, Simon and
               Toral, Antonio",
  booktitle = "International Conference on Language Resources and Evaluation",
  volume    = "main",
  month     = may,
  year      = "2026",
  address   = "Palma de Mallorca, Spain",
  publisher = "ELRA Language Resource Association",
  url       = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.83/",
  pages     = "1071--1080",
  abstract  = "We present a benchmark Yemeni proverbs dataset paired with expert-annotated explanations, designed to evaluate the cultural reasoning abilities of large language models (LLMs). Using zero-shot and few-shot prompting, we assess seven LLMs through both automatic and human evaluation. Results show that instruction-tuned models like GPT-4o and Gemini 1.5 Pro outperform smaller models in both automatic and human evaluations. Few-shot prompting significantly improves performance across all models, underscoring its value for figurative and culturally grounded language tasks. Notably, ALLaM, a bilingual model trained on Arabic and English, achieves competitive results, demonstrating the potential of regionally adapted models for low-resource cultural tasks. LLM-as-a-Judge evaluation correlates strongly with human assessment (Kendall{'}s {\ensuremath{\tau}} up to 0.98). Error analysis identifies recurring literal interpretation and cultural misalignment as key failure modes."
}
@comment{Markdown (Informal)}
@comment{Informal Markdown citation pasted from the Anthology page:
[Beyond Literal Meaning: How LLMs Interpret Yemeni Proverbs](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.83/) (Thmer et al., LREC 2026)
ACL}