@comment{ACL Anthology export (WiNLP 2025). Cleaned: brace-delimited field
values, lowercase field names, whole-word title protection, and the stray
page residue separated from the closing brace.}
@inproceedings{badshah-sajjad-2025-reference,
  title     = {Reference-Guided Verdict: {LLMs}-as-Judges in Automatic Evaluation of Free-Form {QA}},
  author    = {Badshah, Sher and
               Sajjad, Hassan},
  editor    = {Zhang, Chen and
               Allaway, Emily and
               Shen, Hua and
               Miculicich, Lesly and
               Li, Yinqiao and
               M'hamdi, Meryem and
               Limkonchotiwat, Peerat and
               Bai, Richard He and
               T.y.s.s., Santosh and
               Han, Sophia Simeng and
               Thapa, Surendrabikram and
               Rim, Wiem Ben},
  booktitle = {Proceedings of the 9th Widening NLP Workshop},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.winlp-main.37/},
  pages     = {251--267},
  isbn      = {979-8-89176-351-7},
  abstract  = {The emergence of Large Language Models (LLMs) as chat assistants capable of generating human-like conversations has amplified the need for robust evaluation methods, particularly for open-ended tasks. Conventional metrics such as EM and F1, while useful, are inadequate for capturing the full semantics and contextual depth of such generative outputs. We propose a reference-guided verdict method that automates the evaluation process by leveraging multiple LLMs as judges. Through experiments on free-form question-answering tasks, we demonstrate that combining multiple models improves the reliability and accuracy of evaluations, especially in tasks where a single model may struggle. The results indicate a strong correlation with human evaluations, establishing the proposed method as a reliable alternative to traditional metrics.},
}
Markdown (Informal)
[Reference-Guided Verdict: LLMs-as-Judges in Automatic Evaluation of Free-Form QA](https://preview.aclanthology.org/ingest-emnlp/2025.winlp-main.37/) (Badshah & Sajjad, WiNLP 2025)
ACL