Workshop paper (GEM 2025). The url field uses the canonical ACL Anthology
address rather than a temporary preview-branch link.
@inproceedings{huidrom-belz-2025-using,
    title = "Using {LLM} Judgements for Sanity Checking Results and Reproducibility of Human Evaluations in {NLP}",
    author = "Huidrom, Rudali and
      Belz, Anya",
    editor = "Dhole, Kaustubh and
      Clinciu, Miruna",
    booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
    month = jul,
    year = "2025",
    address = "Vienna, Austria and virtual meeting",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.gem-1.30/",
    pages = "354--365",
    isbn = "979-8-89176-261-9",
    abstract = "Human-like evaluation by LLMs of NLP systems is currently attracting a lot of interest, and correlations with human reference evaluations are often remarkably strong. However, this is not always the case, for unclear reasons which means that without also meta-evaluating against human evaluations (incurring the very cost automatic evaluation is intended to avoid), we don{'}t know if an LLM-as-judge evaluation is reliable or not. In this paper, we explore a type of evaluation scenario where this may not matter, because it comes with a built-in reliability check. We apply different LLM-as-judge methods to sets of three comparable human evaluations: (i) an original human evaluation, and (ii) two reproductions of it which produce contradicting reproducibility results. We find that in each case, the different LLM-as-judge methods (i) strongly agree with each other, and (ii) strongly agree with the results of one reproduction, while strongly disagreeing with the other. In combination, we take this to mean that a set of LLMs can be used to sanity check contradictory reproducibility results \textit{if} the LLMs agree with each other, \textit{and} the agreement of the LLMs with one set of results, and the disagreement with the other, are both strong."
}
Markdown (Informal)
[Using LLM Judgements for Sanity Checking Results and Reproducibility of Human Evaluations in NLP](https://aclanthology.org/2025.gem-1.30/) (Huidrom & Belz, GEM 2025)
ACL