@inproceedings{bhat-varma-2026-prompts,
  author    = {Bhat, Savita and Varma, Vasudeva},
  title     = {All Prompts Are Created Equal? Evaluating Robustness of {LLM} Judges Against Non-Adversarial Prompt Variations},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  year      = {2026},
  month     = jul,
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {38730--38745},
  isbn      = {979-8-89176-395-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.1929/},
  abstract  = {LLM-based evaluation systems (LLM judges) have emerged as a scalable alternative to expensive human evaluations. Although LLM judges demonstrate 70-80{\%} agreement with human evaluators, their robustness under semantically equivalent prompt variations remains underexplored. Through systematic evaluation of 8 models across 4 NLG tasks using 10 semantically equivalent paraphrases per prompt ({\textasciitilde}115000 evaluations), we identify a critical accuracy-robustness gap: attribute verifiability affects the robustness more than model choice, with factually verifiable attributes achieving 0.71 accuracy versus 0.19 for subjective attributes. Our investigations discover three key insights: 1) Task structure characteristics influence the robustness and in turn accuracy, 2) Attribute verifiability as the strongest predictor-factually verifiable attribute achieve 0.71 accuracy versus 0.19 for subjective attributes, 3) No single winning model-smallest model (Llama-3.1-8B) exhibits second-best performance, while the strongest model (Llama-4) from the same family significantly lag behind, thus demonstrating that general capability improvements do not necessarily result in evaluation robustness. With these findings, we propose a diagnostic framework grounded in attribute verifiability that enables principled decisions about evaluation automation. Our work establishes new standards for assessing LLM judge reliability beyond simple accuracy metrics.},
}
Markdown (Informal)
[All Prompts Are Created Equal? Evaluating Robustness of LLM Judges Against Non-Adversarial Prompt Variations](https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.1929/) (Bhat & Varma, Findings 2026)
ACL