% Conference-paper record for Datta and Baitalik (2026), "Confidence as a Tie-Breaker".
% NOTE(review): the url field targets a preview/ingest host (preview.aclanthology.org/ingest-acl);
% confirm and swap in the canonical address once the volume is published.
@inproceedings{datta-baitalik-2026-confidence,
  title     = {Confidence as a Tie-Breaker: Reassessing Multilingual Hedging Bias in {LLM}-as-a-Judge Evaluation},
  author    = {Datta, Rajashik and
               Baitalik, Sanjan},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.33/},
  pages     = {393--402},
  isbn      = {979-8-89176-393-7},
  abstract  = {LLM judges are often used to score generated answers, but their decisions may be affected by surface style rather than semantic correctness. We introduce PolyJudge-Uncertain, a controlled benchmark for studying multilingual hedging effects in LLM-as-a-judge evaluation. The benchmark contains 5,120 short factual QA instances across English, Hindi, Hinglish, and Bengali, balancing assertive versus hedged style and correct versus incorrect answers. A small pilot suggested a large pointwise penalty against hedged answers. After repairing multilingual templates and adding quality-control checks, this pointwise effect largely disappears: final pointwise accuracy is 99.8{\%}, with no meaningful assertive-hedged gap. The robust remaining effect is pairwise: when two answers are equally correct and differ only in style, the judge prefers the assertive answer in 1,276 of 1,280 cases. We interpret this as a protocol- and task-specific assertiveness preference, not as a universal bias against hedging. Our findings highlight benchmark auditing as a central requirement for multilingual judge-bias research.},
}
Markdown (Informal)
[Confidence as a Tie-Breaker: Reassessing Multilingual Hedging Bias in LLM-as-a-Judge Evaluation](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.33/) (Datta & Baitalik, ACL 2026)
ACL