@inproceedings{song-etal-2025-enhancing,
title = "Enhancing Human Evaluation in Machine Translation with Comparative Judgement",
author = "Song, Yixiao and
Riley, Parker and
Deutsch, Daniel and
Freitag, Markus",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1002/",
pages = "20536--20551",
ISBN = "979-8-89176-251-0",
abstract = "Human evaluation is crucial for assessing rapidly evolving language models but is influenced by annotator proficiency and task design. This study explores the integration of comparative judgment into human annotation for machine translation (MT) and evaluates three annotation setups{---}point-wise Multidimensional Quality Metrics (MQM), side-by-side (S{\texttimes}S) MQM, and its simplified version S{\texttimes}S relative ranking (RR). In MQM, annotators mark error spans with categories and severity levels. S{\texttimes}S MQM extends MQM to pairwise error annotation for two translations of the same input, while S{\texttimes}S RR focuses on selecting the better output without labeling errors.Key findings are: (1) the S{\texttimes}S settings achieve higher inter-annotator agreement than MQM; (2) S{\texttimes}S MQM enhances inter-translation error marking consistency compared to MQM by, on average, 38.5{\%} for explicitly compared MT systems and 19.5{\%} for others; (3) all annotation settings return stable system rankings, with S{\texttimes}S RR offering a more efficient alternative to (S{\texttimes}S) MQM; (4) the S{\texttimes}S settings highlight subtle errors overlooked in MQM without altering absolute system evaluations.To spur further research, we will release the triply annotated datasets comprising 377 ZhEn and 104 EnDe annotation examples, each covering 10 systems."
}
Markdown (Informal)
[Enhancing Human Evaluation in Machine Translation with Comparative Judgement](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1002/) (Song et al., ACL 2025)