@inproceedings{zhou-etal-2024-llm,
title = "Is {LLM} a Reliable Reviewer? A Comprehensive Evaluation of {LLM} on Automatic Paper Reviewing Tasks",
author = "Zhou, Ruiyang and
Chen, Lu and
Yu, Kai",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.816/",
pages = "9340--9351",
abstract = "The use of large language models (LLM), especially ChatGPT, to help with research has come into practice. Researchers use it for timely advice and hope to obtain in-depth feedback. However, can LLM be a qualified and reliable reviewer? Although there already exist several review-related datasets, few works have carefully and thoroughly inspected model{'}s capability as a reviewer, especially the correctness of generated reviews. In this paper, we first evaluate GPT-3.5 and GPT-4 (the current top-performing LLM) on 2 types of tasks under different settings: the score prediction task and the review generation task. In addition, we propose a dataset containing 197 review-revision multiple-choice questions (RR-MCQ) with detailed labels from the review-rebuttal forum in ICLR-2023. By asking questions from technical details to the overall presentation and quality, our RR-MCQ data provides a more complete model ability assessment. The results show that LLM is generally helpful, but great caution is needed as it always makes mistakes. Although it can give passable decisions ({\ensuremath{>}} 60{\%} accuracy) on single options, completely correct answers are still rare (about 20{\%}); models are still weak on long paper processing, zero-shot scoring, and giving critical feedback like human reviewers."
}
Markdown (Informal):
[Is LLM a Reliable Reviewer? A Comprehensive Evaluation of LLM on Automatic Paper Reviewing Tasks](https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.816/) (Zhou et al., LREC-COLING 2024)