@comment{Workshop paper record. The url field below targets a preview/ingest host.
NOTE(review): confirm whether a canonical (non-preview) URL should replace it once the volume is published.}
@inproceedings{hayati-etal-2026-rubrics,
  title     = {From Rubrics to Recipe: Principle-Centric Benchmark for Evaluating Large Language Models},
  author    = {Hayati, Shirley Anugrah and
               Wang, Ruizi and
               Kang, Dongyeop},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({EvalEval})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.15/},
  pages     = {82--99},
  isbn      = {979-8-89176-429-3},
  abstract  = {Large language models (LLMs) are often evaluated on benchmarks that rely on surface-level instructions, obscuring what defines high-quality performance. We argue that tasks can be more precisely characterized through principles: human-readable rules that specify what matters for a good response to the task. Our study proposes a framework to automatically extract and generate task-level principles for data generation and evaluation. Using this approach, we build a benchmark of over 20K principle-aligned instances, enabling controllable data creation and fine-grained, interpretable assessment of LLMs. Experiments show that principles both improve output quality and scale evaluation beyond manual curation, offering a new recipe for principled assessment of LLM capabilities.},
}

Markdown (Informal)
[From Rubrics to Recipe: Principle-Centric Benchmark for Evaluating Large Language Models](https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.15/) (Hayati et al., EvalEval 2026)
ACL