@inproceedings{kim-2025-rubric,
title = "{RUBRIC}-{MQM} : Span-Level {LLM}-as-judge in Machine Translation For High-End Models",
author = "Kim, Ahrii",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/mtsummit-25-ingestion/2025.acl-industry.12/",
doi = "10.18653/v1/2025.acl-industry.12",
pages = "147--165",
ISBN = "979-8-89176-288-6",
abstract = "Referred to as $\textit{LLM-as-judge}$, a generative large language model (LLM) has demonstrated considerable efficacy as an evaluator in various tasks, including Machine Translation (LAJ-MT) by predicting scores or identifying error types for individual sentences. However, its dependability in practical application has yet to be demonstrated, as there is only an $\textit{approximated match}$ due to the task{'}s open-ended nature. To address this problem, we introduce a straightforward and novel meta-evaluation strategy $\textbf{PromptCUE}$ and evaluate cutting-edge LAJ-MT models such as GEMBA-MQM. We identify their fundamental deficits, including certain label biases and the inability to assess near-perfect translations.To improve reliability, we investigate more trustworthy and less biased models using multidimensional prompt engineering. Our findings indicate that the combination of span-level error quantification and a rubric-style prompt tailored to the characteristics of LLMs has efficiently addressed the majority of the challenges current LAJ-MT models face. Furthermore, it demonstrates a considerably enhanced alignment with human values. Accordingly, we present $\textbf{Rubric-MQM}$, the LAJ-MT for high-end models and an updated version of GEMBA-MQM."
}