% Conference-paper record for "The Confident Liar" (Keramati et al., 2026).
% NOTE(review): the url field points at a preview/ingest host; confirm the
% final canonical URL once the volume is published.
% NOTE(review): the url path says "acl-srw" while booktitle names the main
% ACL meeting; verify which proceedings volume this paper belongs to.
@inproceedings{keramati-etal-2026-confident,
  title     = {The Confident Liar: Diagnosing Multi-Agent Debate with Log-Probabilities and {LLM}-as-Judge},
  author    = {Keramati, Ali and
               Cheok, Justin and
               Horne, Jacob and
               Warschauer, Mark},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.121/},
  pages     = {1361--1375},
  isbn      = {979-8-89176-393-7},
  abstract  = {Multi-agent debate systems are typically evaluated only on whether the final answer is correct, overlooking the quality of the intermediate reasoning that debate is designed to produce. This paper studies the relationship between three signals in multi-agent debate: token-level log-probability distributions over reasoning tokens, LLM-as-judge rubric scores assigned to those tokens, and final task accuracy. We examine whether internal confidence signals predict externally evaluated reasoning quality, and whether either signal aligns with task correctness, across three domains: rubric-based scoring, mathematical reasoning, and factual question answering. Our framework pairs a two-agent debate architecture{---}a Constructor and an Auditor{---}with an LLM-as-judge that scores each agent{'}s reasoning along instruction following, justification quality, and evidence grounding, together with a critical-failure flag. Experiments in the rubric-scoring domain reveal a consistent four-phase confidence trajectory and a substantial role asymmetry: confidence aligns with judged reasoning quality roughly twice as strongly for the Constructor as for the Auditor, and confidence-based detection of critical reasoning failures is markedly more reliable for the Constructor (AUROC 0.804) than for the Auditor (0.634). These findings motivate the broader cross-domain investigation proposed in this paper.},
}
Markdown (Informal)
[The Confident Liar: Diagnosing Multi-Agent Debate with Log-Probabilities and LLM-as-Judge](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.121/) (Keramati et al., ACL 2026)
ACL