@inproceedings{wu-etal-2025-co,
title = "Co-Eval: Augmenting {LLM}-based Evaluation with Machine Metrics",
author = "Wu, Ling-I and
Wu, Weijie and
Chen, Minyu and
Xue, Jianxin and
Li, Guoqiang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1307/",
pages = "25765--25787",
ISBN = "979-8-89176-332-6",
abstract = "Large language models (LLMs) are increasingly used as evaluators in natural language generation tasks, offering advantages in scalability and interpretability over traditional evaluation methods. However, existing LLM-based evaluations often suffer from biases and misalignment, particularly in domain-specific tasks, due to limited functional understanding and knowledge gaps. To address these challenges, we first investigate the relationship between an LLM-based evaluator{'}s familiarity with the target task and its evaluation performance. We then introduce the Co-Eval framework, which leverages a criteria planner model and optimized machine metrics to enhance the scalability and fairness of LLM-based evaluation. Experimental results on both general and domain-specific tasks demonstrate that Co-Eval reduces biases, achieving up to a 0.4903 reduction in self-preference bias, and improves alignment with human preferences, with gains of up to 0.324 in Spearman correlation."
}