@inproceedings{okewunmi-etal-2025-evaluating,
  title     = {Evaluating Robustness of {LLM}s to Typographical Noise in {Yor{\`u}b{\'a}} {QA}},
  author    = {Okewunmi, Paul and
               James, Favour and
               Fajemila, Oluwadunsin},
  editor    = {Lignos, Constantine and
               Abdulmumin, Idris and
               Adelani, David},
  booktitle = {Proceedings of the Sixth Workshop on African Natural Language Processing (AfricaNLP 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.africanlp-1.29/},
  pages     = {195--202},
  isbn      = {979-8-89176-257-2},
  abstract  = {Generative AI models are primarily accessed through chat interfaces, where user queries often contain typographical errors. While these models perform well in English, their robustness to noisy inputs in low-resource languages like Yor{\`u}b{\'a} remains underexplored. This work investigates a Yor{\`u}b{\'a} question-answering (QA) task by introducing synthetic typographical noise into clean inputs. We design a probabilistic noise injection strategy that simulates realistic human typos. In our experiments, each character in a clean sentence is independently altered, with noise levels ranging from 10{\%} to 40{\%}. We evaluate performance across three strong multilingual models using two complementary metrics: (1) a multilingual BERTScore to assess semantic similarity between outputs on clean and noisy inputs, and (2) an LLM-as-judge approach, where the best Yor{\`u}b{\'a}-capable model rates fluency, comprehension, and accuracy on a 1{--}5 scale. Results show that while English QA performance degrades gradually, Yor{\`u}b{\'a} QA suffers a sharper decline. At 40{\%} noise, GPT-4o experiences over a 50{\%} drop in comprehension ability, with similar declines for Gemini 2.0 Flash and Claude 3.7 Sonnet. We conclude with recommendations for noise-aware training and dedicated noisy Yor{\`u}b{\'a} benchmarks to enhance LLM robustness in low-resource settings.},
}
Markdown (Informal)
[Evaluating Robustness of LLMs to Typographical Noise in Yorùbá QA](https://aclanthology.org/2025.africanlp-1.29/) (Okewunmi et al., AfricaNLP 2025)
ACL