@inproceedings{junqueira-moreira-2026-inadequacy,
  title     = {The Inadequacy of Automatic Evaluation Metrics in Question Answering: A Case-Study in {Portuguese}},
  author    = {Junqueira, J{\'u}lia da Rocha and
               Moreira, Viviane P.},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-dnd/2026.propor-1.54/},
  pages     = {551--561},
  isbn      = {979-8-89176-387-6},
  abstract  = {Questions and answers are among the most fundamental forms of human communication. Question Answering (QA) is the task of correctly generating answers based on a context. To assess the success of the task, the answers are typically evaluated using traditional metrics such as BLEU, ROUGE, and METEOR. However, these metrics often fail to reflect the actual quality of the outputs. More recently, new evaluation metrics and the LLM-as-a-judge paradigm have also been applied to the evaluation of QA. To gain a deeper understanding of the capabilities and limitations of QA metrics, this work performs a comparative analysis of both traditional and more recent approaches for QA evaluation. Experiments were conducted on the Pir{\'a} dataset (in Portuguese) using four LLMs to generate answers. Additionally, human evaluation was performed to assess aspects such as correctness, completeness, clarity, and relevance of the generated content. We demonstrate that lexical metrics are limited in evaluating QA. We also observed that human evaluators favor models that provide higher information density, even when this contradicts prompt constraints, whereas lexical metrics penalize this verbosity. This divergence confirms that traditional metrics are insufficient for capturing the trade-off between instruction adherence and the semantic richness valued by native speakers.},
}
Markdown (Informal)
[The Inadequacy of Automatic Evaluation Metrics in Question Answering: A Case-Study in Portuguese](https://preview.aclanthology.org/ingest-dnd/2026.propor-1.54/) (Junqueira & Moreira, PROPOR 2026)
ACL