@comment{
  Conference paper from the ACL 2026 proceedings, Volume 2 (Short Papers).
  Field values use brace delimiters; words that must keep their capitalisation
  under sentence-casing styles are protected with whole-word braces.
  NOTE(review): the url field points at a preview / ingest-acl host, which looks
  like a staging link. Confirm the canonical URL before relying on it.
}
@inproceedings{kumar-etal-2026-revisiting,
  title     = {Revisiting Evaluation of Question Answering Systems in Low-Resource {Indic} Languages: Bridging Human and Metric Alignment},
  author    = {Kumar, Anuj and
               Ahlawat, Satyadev and
               Prasad, Yamuna and
               Singh, Virendra},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 2: Short Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-short.58/},
  pages     = {703--718},
  isbn      = {979-8-89176-391-3},
  abstract  = {Evaluating Question Answering (QA) systems in low-resource Indic languages remains challenging due to the scarcity of annotated data, high linguistic diversity, and the absence of reliable evaluation metrics. Many Indian languages are severely underrepresented, making it difficult to accurately assess the performance of Large Language Models (LLMs) on QA tasks. Commonly used metrics like BLEU, ROUGE-L, and BERTScore, while successful in machine translation and resource-rich scenarios, tend to perform poorly in low-resource QA settings. These metrics often exhibit issues such as compressed scoring ranges, excessive zero scores, and weak alignment with human judgments. To overcome these limitations, this work introduces the LRM$^{2}$QAS (Language Robust Multi-aspect Metrics for Question Answering Systems). This composite evaluation framework integrates semantic similarity, factual completeness, numerical accuracy, and contextual relevance. The proposed metric is evaluated across eight Indic-language QA tasks using multiple LLMs, as well as on open-domain benchmarks NaturalQuestions (NQ) and TriviaQA (TQ). Across all settings, LRM$^{2}$QAS demonstrates stronger agreement with human evaluation, as measured by Pearson, Spearman, and Kendall correlation coefficients. Experimental findings highlight that LRM$^{2}$QAS provides more precise distinctions between model outputs and aligns more closely with human judgment, offering a reliable framework for evaluating multilingual QA in low-resource Indic languages.},
}
Markdown (Informal)
[Revisiting Evaluation of Question Answering Systems in Low-Resource Indic Languages: Bridging Human and Metric Alignment](https://preview.aclanthology.org/ingest-acl/2026.acl-short.58/) (Kumar et al., ACL 2026)
ACL