@inproceedings{lo-etal-2023-beyond,
title = "Beyond Correlation: Making Sense of the Score Differences of New {MT} Evaluation Metrics",
author = "Lo, Chi-kiu and
Knowles, Rebecca and
Goutte, Cyril",
editor = "Utiyama, Masao and
Wang, Rui",
booktitle = "Proceedings of Machine Translation Summit XIX, Vol. 1: Research Track",
month = sep,
year = "2023",
address = "Macau SAR, China",
publisher = "Asia-Pacific Association for Machine Translation",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.mtsummit-research.16/",
pages = "186--199",
abstract = "While many new automatic metrics for machine translation evaluation have been proposed in recent years, BLEU scores are still used as the primary metric in the vast majority of MT research papers. There are many reasons that researchers may be reluctant to switch to new metrics, from external pressures (reviewers, prior work) to the ease of use of metric toolkits. Another reason is a lack of intuition about the meaning of novel metric scores. In this work, we examine ``rules of thumb'' about metric score differences and how they do (and do not) correspond to human judgments of statistically significant differences between systems. In particular, we show that common rules of thumb about BLEU score differences do not in fact guarantee that human annotators will find significant differences between systems. We also show ways in which these rules of thumb fail to generalize across translation directions or domains."
}