@inproceedings{xu-etal-2025-progress,
title = "The Progress Illusion: Revisiting meta-evaluation standards of {LLM} evaluators",
author = "Xu, Tianruo Rose and
Gaur, Vedant and
Leqi, Liu and
Goyal, Tanya",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.findings-emnlp.1036/",
doi = "10.18653/v1/2025.findings-emnlp.1036",
pages = "19033--19043",
ISBN = "979-8-89176-335-7",
abstract = "LLM judges have gained popularity as an inexpensive and performant substitute for human evaluation. However, we observe that the meta-evaluation setting in which the reliability of these LLM evaluators is established is substantially different from their use in model development. To address this, we revisit meta-evaluations of LLM evaluators under a setting that more closely aligns with practice by examining evaluators' ability to distinguish test system pairs that are closer in capability. Our fine-grained approach shows that all LLM evaluator{'}s correlations with human judgments are concerningly low when the models perform similarly, showcasing a key limitation of current norms. Equipped with this better methodology, we next analyze the impact that the choice of the reference model makes to LLM-as-a-judge evaluator performance. We show that single-reference evaluators only perform well at ranking test systems that fall within particular capability ranges, even if the standard meta-evaluation reports high overall correlation. Taken together, our analysis shows critical issues with current LLM meta-evaluation and recommend avenues for improvement."
}