% Glushkova, Zerva and Martins (EAMT 2023): combining lexical and neural MT evaluation metrics.
% The url field uses the canonical ACL Anthology landing page, not a preview-branch build.
% Editors with two-part surnames are entered in "Last, First" form with the full surname
% before the comma, so styles sort and abbreviate them correctly.
@inproceedings{glushkova-etal-2023-bleu,
  title     = {{BLEU} Meets {COMET}: Combining Lexical and Neural Metrics Towards Robust Machine Translation Evaluation},
  author    = {Glushkova, Taisiya and
               Zerva, Chrysoula and
               Martins, Andr{\'e} F. T.},
  editor    = {Nurminen, Mary and
               Brenner, Judith and
               Koponen, Maarit and
               Latomaa, Sirkku and
               Mikhailov, Mikhail and
               Schierl, Frederike and
               Ranasinghe, Tharindu and
               Vanmassenhove, Eva and
               Alvarez Vidal, Sergi and
               Aranberri, Nora and
               Nunziatini, Mara and
               Parra Escart{\'i}n, Carla and
               Forcada, Mikel and
               Popovic, Maja and
               Scarton, Carolina and
               Moniz, Helena},
  booktitle = {Proceedings of the 24th Annual Conference of the European Association for Machine Translation},
  month     = jun,
  year      = {2023},
  address   = {Tampere, Finland},
  publisher = {European Association for Machine Translation},
  url       = {https://aclanthology.org/2023.eamt-1.6/},
  pages     = {47--58},
  abstract  = {Although neural-based machine translation evaluation metrics, such as COMET or BLEURT, have achieved strong correlations with human judgements, they are sometimes unreliable in detecting certain phenomena that can be considered as critical errors, such as deviations in entities and numbers. In contrast, traditional evaluation metrics such as BLEU or chrF, which measure lexical or character overlap between translation hypotheses and human references, have lower correlations with human judgements but are sensitive to such deviations. In this paper, we investigate several ways of combining the two approaches in order to increase robustness of state-of-the-art evaluation methods to translations with critical errors. We show that by using additional information during training, such as sentence-level features and word-level tags, the trained metrics improve their capability to penalize translations with specific troublesome phenomena, which leads to gains in correlations with humans and on the recent DEMETR benchmark on several language pairs.},
}