@inproceedings{panahi-etal-2026-llms,
    title = "When {LLM}s Annotate: Reliability Challenges in Low-Resource {NLI}",
    author = "Panahi, Solmaz and
      Kelleher, John and
      Nedumpozhimana, Vasudevan",
    editor = "Hettiarachchi, Hansi and
      Ranasinghe, Tharindu and
      Plum, Alistair and
      Rayson, Paul and
      Mitkov, Ruslan and
      Gaber, Mohamed and
      Premasiri, Damith and
      Tan, Fiona Anting and
      Uyangodage, Lasitha",
    booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/manual-author-scripts/2026.loreslm-1.17/",
    pages = "178--188",
    isbn = "979-8-89176-377-7",
    abstract = "This paper systematically evaluates LLM reliability on the complex semantic task of Natural Language Inference (NLI) in Farsi, assessing six prominent models across eight prompt variations through a multi-dimensional framework that measures accuracy, prompt sensitivity, and intra-class consistency. Our results demonstrate that prompt design{---}particularly the order of premise and hypothesis{---}significantly impacts prediction stability. Proprietary models (Claude-Opus-4, GPT-4o) exhibit superior stability and accuracy compared to open-weight alternatives. Across all models, the `Neutral' class emerges as the most challenging and least stable category. Crucially, we redefine model instability as a diagnostic tool for benchmark quality, demonstrating that observed disagreement often reflects valid challenges to ambiguous or erroneous gold-standard labels."
}
Markdown (Informal)
[When LLMs Annotate: Reliability Challenges in Low-Resource NLI](https://preview.aclanthology.org/manual-author-scripts/2026.loreslm-1.17/) (Panahi et al., LoResLM 2026)
ACL