% Poster paper in the proceedings of Machine Translation Summit X (2005).
% NOTE(review): `address` presumably holds the conference venue rather than a
% publisher location; no publisher is recorded in this entry -- confirm before moving it.
@inproceedings{babych-etal-2005-estimating,
  title     = {Estimating the predictive Power of N-gram {MT} Evaluation Metrics across Language and Text Types},
  author    = {Babych, Bogdan and
               Hartley, Anthony and
               Elliott, Debbie},
  booktitle = {Proceedings of Machine Translation Summit X: Posters},
  month     = sep # " 13--15",
  year      = {2005},
  address   = {Phuket, Thailand},
  pages     = {412--418},
  url       = {https://aclanthology.org/2005.mtsummit-posters.13/},
  abstract  = {The use of n-gram metrics to evaluate the output of MT systems is widespread. Typically, they are used in system development, where an increase in the score is taken to represent an improvement in the output of the system. However, purchasers of MT systems or services are more concerned to know how well a score predicts the acceptability of the output to a reader-user. Moreover, they usually want to know if these predictions will hold across a range of target languages and text types. We describe an experiment involving human and automated evaluations of four MT systems across two text types and 23 language directions. It establishes that the correlation between human and automated scores is high, but that the predictive power of these scores depends crucially on target language and text type.},
}
Markdown (Informal)
[Estimating the predictive Power of N-gram MT Evaluation Metrics across Language and Text Types](https://aclanthology.org/2005.mtsummit-posters.13/) (Babych et al., MTSummit 2005)
ACL