@comment{Durmus, Ladhak and Hashimoto: ACL 2022 (Volume 1: Long Papers), pp. 1443--1454.}
@inproceedings{durmus-etal-2022-spurious,
  title     = {Spurious Correlations in Reference-Free Evaluation of Text Generation},
  author    = {Durmus, Esin and Ladhak, Faisal and Hashimoto, Tatsunori},
  editor    = {Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-long.102/},
  doi       = {10.18653/v1/2022.acl-long.102},
  pages     = {1443--1454},
  abstract  = {Model-based, reference-free evaluation metrics have been proposed as a fast and cost-effective approach to evaluate Natural Language Generation (NLG) systems. Despite promising recent results, we find evidence that reference-free evaluation metrics of summarization and dialog generation may be relying on spurious correlations with measures such as word overlap, perplexity, and length. We further observe that for text summarization, these metrics have high error rates when ranking current state-of-the-art abstractive summarization systems. We demonstrate that these errors can be mitigated by explicitly designing evaluation metrics to avoid spurious features in reference-free evaluation.},
}
Markdown (Informal)
[Spurious Correlations in Reference-Free Evaluation of Text Generation](https://aclanthology.org/2022.acl-long.102/) (Durmus et al., ACL 2022)
ACL