@inproceedings{junczys-dowmunt-2025-gemba,
title = "{GEMBA} V2: Ten Judgments Are Better Than One",
author = "Junczys-Dowmunt, Marcin",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Tenth Conference on Machine Translation",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.wmt-1.67/",
pages = "926--933",
ISBN = "979-8-89176-341-8",
abstract = "We introduce GEMBA-MQM V2, an MQM-inspired, reference-free LLM evaluation metric for the WMT25 Metrics Shared Task (Subtask 1). Building on GEMBA/GEMBA-MQM, we prompt GPT-4.1-mini to produce structured MQM error annotations per segment. We map annotations to scores with 25/5/1 severity weights (minor punctuation = 0.1). To reduce stochastic variance, each segment is scored ten times and aggregated with a reciprocal-rank weighted average (RRWA) after removing outliers beyond $2\sigma$. On the WMT24 MQM test sets, GEMBA-MQM V2 ranks first by average correlation, with strong results across languages and evaluation levels; WMT23 results show comparable performance."
}
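Below is a minimal, hypothetical Python sketch (not from the paper) of the scoring and aggregation described in the abstract: MQM error annotations mapped to penalties with 25/5/1 severity weights (minor punctuation = 0.1), repeated judgments filtered at 2 sigma, and combined with a reciprocal-rank weighted average. Function names, the annotation schema, and the ranking convention (best score first, weight 1/rank) are assumptions.

```python
import statistics

# Severity weights following the abstract's 25/5/1 scheme, with minor
# punctuation errors weighted 0.1 (the key names here are assumptions).
SEVERITY_WEIGHTS = {
    ("critical", None): 25.0,
    ("major", None): 5.0,
    ("minor", None): 1.0,
    ("minor", "punctuation"): 0.1,
}


def segment_penalty(errors):
    """Sum severity weights over one judgment's MQM error annotations.

    `errors` is a list of (severity, category) tuples; the exact
    annotation format used in the paper is not reproduced here.
    """
    total = 0.0
    for severity, category in errors:
        weight = SEVERITY_WEIGHTS.get(
            (severity, category), SEVERITY_WEIGHTS.get((severity, None), 0.0)
        )
        total += weight
    return -total  # higher (less negative) is better


def rrwa(scores, sigma_cutoff=2.0):
    """Reciprocal-rank weighted average over repeated judgments.

    Drops scores farther than `sigma_cutoff` standard deviations from the
    mean, then weights the remaining scores by 1/rank after sorting with
    the best score first (the ranking convention is an assumption).
    """
    if len(scores) > 2:
        mu = statistics.mean(scores)
        sd = statistics.pstdev(scores)
        kept = [s for s in scores if sd == 0 or abs(s - mu) <= sigma_cutoff * sd]
    else:
        kept = list(scores)
    ranked = sorted(kept, reverse=True)
    weights = [1.0 / (i + 1) for i in range(len(ranked))]
    return sum(w * s for w, s in zip(weights, ranked)) / sum(weights)


# Example: ten stochastic judgments for one segment, one of them an outlier.
judgments = [-1.0, -1.1, -0.9, -1.0, -1.2, -6.0, -1.0, -1.1, -0.9, -1.0]
print(rrwa(judgments))
```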