@inproceedings{siro-etal-2026-learning,
title = "Learning to Judge: {LLM}s Designing and Applying Evaluation Rubrics",
author = "Siro, Clemencia and
Aliannejadi, Pourya and
Aliannejadi, Mohammad",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.335/",
pages = "6371--6389",
ISBN = "979-8-89176-386-9",
abstract = "Large language models (LLMs) are increasingly used as evaluators for natural language generation, applying human-defined rubrics to assess system outputs. However, human rubrics are often static and misaligned with how models internally represent language quality. We introduce GER-Eval (Generating Evaluation Rubrics for Evaluation) to investigate whether LLMs can design and use their own evaluation rubrics. We evaluate the semantic coherence and scoring reliability of LLM-defined criteria and their alignment with human criteria. LLMs reliably generate interpretable and task-aware evaluation dimensions and apply them within models, but their scoring reliability degrades in factual and knowledge-intensive settings. Closed-source models such as GPT-4o achieve higher agreement and cross-model generalization than open-weight models such as Llama. Our findings position evaluation as a learned linguistic capability of LLMs{---}consistent within models but fragmented across them{---}and call for new methods that jointly model human and LLM evaluative language to improve reliability and interpretability."
}