% Workshop paper: Huang, Wu, Yang and Arase, "Reasoning Model Is Superior
% LLM-Judge, Yet Suffers from Biases" (EvalEval workshop proceedings, 2026).
% NOTE(review): the url field points at a preview/ingest host; confirm and
% switch to the canonical Anthology URL once the volume is published.
@inproceedings{huang-etal-2026-reasoning,
  title     = {Reasoning Model Is Superior {LLM}-Judge, Yet Suffers from Biases},
  author    = {Huang, Hui and
               Wu, Xuanxin and
               Yang, Muyun and
               Arase, Yuki},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({EvalEval})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.13/},
  pages     = {70--81},
  isbn      = {979-8-89176-429-3},
  abstract  = {This paper presents the first systematic comparison investigating whether Large Reasoning Models (LRMs) are superior judges to non-reasoning LLMs. Our empirical analysis yields four key findings: 1) LRMs outperform non-reasoning LLMs in terms of judgment accuracy, particularly on reasoning-intensive tasks; 2) LRMs demonstrate superior evaluation instruction-following capabilities; 3) LRMs exhibit enhanced robustness against adversarial attacks targeting judgment tasks; 4) However, LRMs still exhibit strong evaluation biases. To mitigate this bias vulnerability, we propose PlanJudge, a lightweight evaluation strategy that prompts the model to generate an explicit evaluation plan before executing the judgment. Despite its simplicity, our experiments demonstrate that PlanJudge significantly mitigates biases in LLM-as-a-Judge while preserving overall judgment accuracy.},
}

Markdown (Informal)
[Reasoning Model Is Superior LLM-Judge, Yet Suffers from Biases](https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.13/) (Huang et al., EvalEval 2026)
ACL