@inproceedings{kumar-etal-2025-courteval,
title = "{C}ourt{E}val: A Courtroom-Based Multi-Agent Evaluation Framework",
author = "Kumar, Sandeep and
Nargund, Abhijit A and
Sridhar, Vivek",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.findings-acl.1327/",
pages = "25875--25887",
ISBN = "979-8-89176-256-5",
  abstract = "Automated evaluation is crucial for assessing the quality of natural language text, especially in open-ended generation tasks, given the costly and time-consuming nature of human evaluation. Existing automatic evaluation metrics like ROUGE and BLEU often show low correlation with human judgments. As large language models (LLMs) continue to evolve, researchers have explored their use as alternatives to human evaluators. Although single-agent approaches have shown potential, results indicate that further progress is required to close the gap between their performance and the quality of human assessments. Since human evaluations typically involve multiple annotators, a multi-agent approach allows LLMs to collaborate, improving efficiency and effectiveness on complex tasks. In this paper, we present CourtEval, a novel Multi-Agent Evaluation Framework modeled after courtroom dynamics. Each agent takes on a distinct role: the Grader, similar to a judge, assigns an initial score; the Critic, like a prosecutor, challenges this score; and the Defender, akin to a defense attorney, defends it. Based on the input from both the Critic and the Defender, the Grader re-evaluates the score, leading to a more balanced and fair final decision through this adversarial process. CourtEval substantially outperforms previous state-of-the-art methods on two NLG meta-evaluation benchmarks, SummEval and TopicalChat."
}
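
The abstract describes the courtroom roles (Grader, Critic, Defender) but not their wiring. Below is a minimal sketch, assuming only what the abstract states, of how such an adversarial evaluation loop could be organized; `call_llm`, the prompts, and the number of deliberation rounds are hypothetical placeholders, not the paper's actual implementation.

```python
# Courtroom-style multi-agent evaluation sketch based on the CourtEval abstract.
# `call_llm` is a hypothetical stand-in for any chat-completion client; the
# prompt wording and the single deliberation round are assumptions.

def call_llm(prompt: str) -> str:
    """Hypothetical LLM call; replace with a real chat-completion client."""
    raise NotImplementedError("plug in an LLM client here")

def court_eval(source: str, candidate: str, criterion: str, rounds: int = 1) -> str:
    # Grader (judge): assign an initial score for the chosen criterion.
    verdict = call_llm(
        f"Score the candidate on {criterion} from 1 to 5 and justify briefly.\n"
        f"Source: {source}\nCandidate: {candidate}"
    )
    for _ in range(rounds):
        # Critic (prosecutor): challenge the current evaluation.
        critique = call_llm(
            f"Challenge this evaluation; point out weaknesses it overlooks.\n{verdict}"
        )
        # Defender (defense attorney): defend the current evaluation.
        defense = call_llm(
            f"Defend this evaluation against the critique.\n"
            f"Evaluation: {verdict}\nCritique: {critique}"
        )
        # Grader re-evaluates in light of both adversarial arguments.
        verdict = call_llm(
            f"Re-evaluate and give a final 1-5 score on {criterion}.\n"
            f"Initial evaluation: {verdict}\nCritique: {critique}\nDefense: {defense}"
        )
    return verdict
```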