% HypoEval (Li, Li, and Tan): long paper in the proceedings of the 64th Annual
% Meeting of the ACL, 2026. Case-sensitive names are protected as whole words.
% NOTE(review): the url field points at the Anthology preview/ingest host;
% presumably it should be swapped for the canonical address once one exists -- confirm.
@inproceedings{li-etal-2026-hypoeval,
  title     = {{HypoEval}: Hypothesis-Guided Evaluation for Natural Language Generation},
  author    = {Li, Mingxuan and
               Li, Hanchen and
               Tan, Chenhao},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1963/},
  pages     = {42424--42443},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large language models (LLMs) have demonstrated great potential for automating the evaluation of natural language generation. Previous frameworks of LLM-as-a-judge fall short in two ways: they either use zero-shot setting without consulting any human input, which leads to low alignment, or fine-tune LLMs on labeled data, which requires a non-trivial number of samples. Moreover, previous methods often provide little reasoning behind automated evaluations. In this paper, we propose HypoEval, Hypothesis-guided Evaluation framework, which first uses a small corpus of human evaluations to generate more detailed rubrics for human judgments and then incorporates a checklist-like approach to combine LLM{'}s assigned scores on each decomposed dimension to acquire overall scores. With only 30 human evaluations, HypoEval achieves state-of-the-art performance in alignment with both human rankings (Spearman correlation) and human scores (Pearson correlation), on average outperforming G-Eval by 11.86{\%} and fine-tuned Llama-3.1-8B-Instruct with at least 3 times more human evaluations by 11.95{\%}. Furthermore, we conduct systematic studies to assess the robustness of HypoEval, highlighting its effectiveness as a reliable and interpretable automated evaluation framework.},
}
Markdown (Informal)
[HypoEval: Hypothesis-Guided Evaluation for Natural Language Generation](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1963/) (Li et al., ACL 2026)
ACL