% DEBATE (Findings of ACL 2024): multi-agent NLG evaluation framework in which
% one agent is instructed to act as a Devil's Advocate against the others.
@inproceedings{kim-etal-2024-debate,
  title     = {{DEBATE}: Devil's Advocate-Based Assessment and Text Evaluation},
  author    = {Kim, Alex and
               Kim, Keonwoo and
               Yoon, Sangwon},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.112/},
  doi       = {10.18653/v1/2024.findings-acl.112},
  pages     = {1885--1897},
  abstract  = {As natural language generation (NLG) models have become prevalent, systematically assessing the quality of machine-generated texts has become increasingly important. Recent studies introduce LLM-based evaluators that operate as reference-free metrics, demonstrating their capability to adeptly handle novel tasks. However, these models generally rely on a single-agent approach, which, we argue, introduces an inherent limit to their performance. This is because there exist biases in LLM agent's responses, including preferences for certain text structure or content. In this work, we propose DEBATE, an NLG evaluation framework based on multi-agent scoring system augmented with a concept of Devil's Advocate. Within the framework, one agent is instructed to criticize other agents' arguments, potentially resolving the bias in LLM agent's answers. DEBATE substantially outperforms the previous state-of-the-art methods in two meta-evaluation benchmarks in NLG evaluation, SummEval and TopicalChat. We also show that the extensiveness of debates among agents and the persona of an agent can influence the performance of evaluators.},
}
Markdown (Informal)
[DEBATE: Devil’s Advocate-Based Assessment and Text Evaluation](https://aclanthology.org/2024.findings-acl.112/) (Kim et al., Findings 2024)
ACL