@inproceedings{bremerman-etal-2020-evaluation,
  title     = {On the Evaluation of Machine Translation {n-best} Lists},
  author    = {Bremerman, Jacob and
               Khayrallah, Huda and
               Oard, Douglas and
               Post, Matt},
  editor    = {Eger, Steffen and
               Gao, Yang and
               Peyrard, Maxime and
               Zhao, Wei and
               Hovy, Eduard},
  booktitle = {Proceedings of the First Workshop on Evaluation and Comparison of {NLP} Systems},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/fix-sig-urls/2020.eval4nlp-1.7/},
  doi       = {10.18653/v1/2020.eval4nlp-1.7},
  pages     = {60--68},
  abstract  = {The standard machine translation evaluation framework measures the single-best output of machine translation systems. There are, however, many situations where n-best lists are needed, yet there is no established way of evaluating them. This paper establishes a framework for addressing n-best evaluation by outlining three different questions one could consider when determining how one would define a `good' n-best list and proposing evaluation measures for each question. The first and principal contribution is an evaluation measure that characterizes the translation quality of an entire n-best list by asking whether many of the valid translations are placed near the top of the list. The second is a measure that uses gold translations with preference annotations to ask to what degree systems can produce ranked lists in preference order. The third is a measure that rewards partial matches, evaluating the closeness of the many items in an n-best list to a set of many valid references. These three perspectives make clear that having access to many references can be useful when n-best evaluation is the goal.},
}
Markdown (Informal)
[On the Evaluation of Machine Translation n-best Lists](https://preview.aclanthology.org/fix-sig-urls/2020.eval4nlp-1.7/) (Bremerman et al., Eval4NLP 2020)
ACL
- Jacob Bremerman, Huda Khayrallah, Douglas Oard, and Matt Post. 2020. On the Evaluation of Machine Translation n-best Lists. In Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems, pages 60–68, Online. Association for Computational Linguistics.