% ACL 2024 short paper on using large language models as debate evaluators.
% Text outside an entry is ignored by BibTeX, so this header is a comment.
@inproceedings{liu-etal-2024-empirical,
  title     = {An Empirical Analysis on Large Language Models in Debate Evaluation},
  author    = {Liu, Xinyi and
               Liu, Pinxin and
               He, Hangfeng},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-short.44/},
  doi       = {10.18653/v1/2024.acl-short.44},
  pages     = {470--487},
  abstract  = {In this study, we investigate the capabilities and inherent biases of advanced large language models (LLMs) such as GPT-3.5 and GPT-4 in the context of debate evaluation. We discover that LLM's performance exceeds humans and surpasses the performance of state-of-the-art methods fine-tuned on extensive datasets. We additionally explore and analyze biases present in LLMs, including positional bias, lexical bias, order bias, which may affect their evaluative judgments. Our findings reveal a consistent bias in both GPT-3.5 and GPT-4 towards the second candidate response presented, attributed to prompt design. We also uncover a lexical bias in both GPT-3.5 and GPT-4, especially when label sets carry connotations such as numerical or sequential, highlighting the critical need for careful label verbalizer selection in prompt design. Additionally, our analysis indicates a tendency of both models to favor the debate's concluding side as the winner, suggesting an end-of-discussion bias.},
}
Markdown (Informal)
[An Empirical Analysis on Large Language Models in Debate Evaluation](https://aclanthology.org/2024.acl-short.44/) (Liu et al., ACL 2024)
ACL