@inproceedings{graham-etal-2016-glitters,
title = "Is all that Glitters in Machine Translation Quality Estimation really Gold?",
author = "Graham, Yvette and
Baldwin, Timothy and
Dowling, Meghan and
Eskevich, Maria and
Lynn, Teresa and
Tounsi, Lamia",
booktitle = "Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/C16-1294",
pages = "3124--3134",
abstract = "Human-targeted metrics provide a compromise between human evaluation of machine translation, where high inter-annotator agreement is difficult to achieve, and fully automatic metrics, such as BLEU or TER, that lack the validity of human assessment. Human-targeted translation edit rate (HTER) is by far the most widely employed human-targeted metric in machine translation, commonly employed, for example, as a gold standard in evaluation of quality estimation. Original experiments justifying the design of HTER, as opposed to other possible formulations, were limited to a small sample of translations and a single language pair, however, and this motivates our re-evaluation of a range of human-targeted metrics on a substantially larger scale. Results show significantly stronger correlation with human judgment for HBLEU over HTER for two of the nine language pairs we include and no significant difference between correlations achieved by HTER and HBLEU for the remaining language pairs. Finally, we evaluate a range of quality estimation systems employing HTER and direct assessment (DA) of translation adequacy as gold labels, resulting in a divergence in system rankings, and propose employment of DA for future quality estimation evaluations.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="graham-etal-2016-glitters">
<titleInfo>
<title>Is all that Glitters in Machine Translation Quality Estimation really Gold?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timothy</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meghan</namePart>
<namePart type="family">Dowling</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Eskevich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teresa</namePart>
<namePart type="family">Lynn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lamia</namePart>
<namePart type="family">Tounsi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-dec</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers</title>
</titleInfo>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Human-targeted metrics provide a compromise between human evaluation of machine translation, where high inter-annotator agreement is difficult to achieve, and fully automatic metrics, such as BLEU or TER, that lack the validity of human assessment. Human-targeted translation edit rate (HTER) is by far the most widely employed human-targeted metric in machine translation, commonly employed, for example, as a gold standard in evaluation of quality estimation. Original experiments justifying the design of HTER, as opposed to other possible formulations, were limited to a small sample of translations and a single language pair, however, and this motivates our re-evaluation of a range of human-targeted metrics on a substantially larger scale. Results show significantly stronger correlation with human judgment for HBLEU over HTER for two of the nine language pairs we include and no significant difference between correlations achieved by HTER and HBLEU for the remaining language pairs. Finally, we evaluate a range of quality estimation systems employing HTER and direct assessment (DA) of translation adequacy as gold labels, resulting in a divergence in system rankings, and propose employment of DA for future quality estimation evaluations.</abstract>
<identifier type="citekey">graham-etal-2016-glitters</identifier>
<location>
<url>https://aclanthology.org/C16-1294</url>
</location>
<part>
<date>2016-dec</date>
<extent unit="page">
<start>3124</start>
<end>3134</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Is all that Glitters in Machine Translation Quality Estimation really Gold?
%A Graham, Yvette
%A Baldwin, Timothy
%A Dowling, Meghan
%A Eskevich, Maria
%A Lynn, Teresa
%A Tounsi, Lamia
%S Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers
%D 2016
%8 dec
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F graham-etal-2016-glitters
%X Human-targeted metrics provide a compromise between human evaluation of machine translation, where high inter-annotator agreement is difficult to achieve, and fully automatic metrics, such as BLEU or TER, that lack the validity of human assessment. Human-targeted translation edit rate (HTER) is by far the most widely employed human-targeted metric in machine translation, commonly employed, for example, as a gold standard in evaluation of quality estimation. Original experiments justifying the design of HTER, as opposed to other possible formulations, were limited to a small sample of translations and a single language pair, however, and this motivates our re-evaluation of a range of human-targeted metrics on a substantially larger scale. Results show significantly stronger correlation with human judgment for HBLEU over HTER for two of the nine language pairs we include and no significant difference between correlations achieved by HTER and HBLEU for the remaining language pairs. Finally, we evaluate a range of quality estimation systems employing HTER and direct assessment (DA) of translation adequacy as gold labels, resulting in a divergence in system rankings, and propose employment of DA for future quality estimation evaluations.
%U https://aclanthology.org/C16-1294
%P 3124-3134
Markdown (Informal)
[Is all that Glitters in Machine Translation Quality Estimation really Gold?](https://aclanthology.org/C16-1294) (Graham et al., COLING 2016)
ACL