% Kim (WMT 2025), document-level MT evaluation. The url field uses the canonical
% ACL Anthology address rather than a temporary preview-branch build.
@inproceedings{kim-2025-context,
  title     = {Context Is Ubiquitous, but Rarely Changes Judgments: Revisiting Document-Level {MT} Evaluation},
  author    = {Kim, Ahrii},
  editor    = {Haddow, Barry and Kocmi, Tom and Koehn, Philipp and Monz, Christof},
  booktitle = {Proceedings of the Tenth Conference on Machine Translation},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wmt-1.5/},
  doi       = {10.18653/v1/2025.wmt-1.5},
  pages     = {81--97},
  isbn      = {979-8-89176-341-8},
  abstract  = {As sentence-level performance in modern Machine Translation (MT) has plateaued, reliable document-level evaluation is increasingly needed. While the recent FALCON framework with pragmatic features offers a promising direction, its reliability and reproducibility are unclear. We address this gap through human evaluation, analyzing sources of low inter-annotator agreement and identifying key factors. Based on these findings, we introduce H-FALCON, a Human-centered refinement of FALCON. Our experiments show that, even with limited annotator consensus, FALCON achieves correlations comparable to or better than standard sentence-level protocols. Furthermore, we find that contextual information is inherent in all sentences, challenging the view that only some require it. This suggests that prior estimates such as ``n{\%} of sentences require context'' may stem from methodological artifacts. At the same time, we show that while context is pervasive, not all of it directly influences human judgment.},
}
Markdown (Informal)
[Context Is Ubiquitous, but Rarely Changes Judgments: Revisiting Document-Level MT Evaluation](https://aclanthology.org/2025.wmt-1.5/) (Kim, WMT 2025)
ACL