@inproceedings{tsuta-etal-2020-ubleu,
title = "u{BLEU}: Uncertainty-Aware Automatic Evaluation Method for Open-Domain Dialogue Systems",
author = "Tsuta, Yuma and
Yoshinaga, Naoki and
Toyoda, Masashi",
editor = "Rijhwani, Shruti and
Liu, Jiangming and
Wang, Yizhong and
Dror, Rotem",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2020.acl-srw.27/",
doi = "10.18653/v1/2020.acl-srw.27",
pages = "199--206",
abstract = "Because open-domain dialogues allow diverse responses, basic reference-based metrics such as BLEU do not work well unless we prepare a massive reference set of high-quality responses for input utterances. To reduce this burden, a human-aided, uncertainty-aware metric, {\ensuremath{\Delta}}BLEU, has been proposed; it embeds human judgment on the quality of reference outputs into the computation of multiple-reference BLEU. In this study, we instead propose a fully automatic, uncertainty-aware evaluation method for open-domain dialogue systems, {\ensuremath{\upsilon}}BLEU. This method first collects diverse reference responses from massive dialogue data and then annotates their quality judgments by using a neural network trained on automatically collected training data. Experimental results on massive Twitter data confirmed that {\ensuremath{\upsilon}}BLEU is comparable to {\ensuremath{\Delta}}BLEU in terms of its correlation with human judgment and that the state of the art automatic evaluation method, RUBER, is improved by integrating {\ensuremath{\upsilon}}BLEU."
}
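The abstract describes {\Delta}BLEU and {\upsilon}BLEU as multi-reference BLEU variants in which each n-gram match is scaled by a quality judgment on the reference it matches (human-provided for {\Delta}BLEU, predicted by a trained neural rater for {\upsilon}BLEU). The Python sketch below illustrates that weighting scheme for sentence-level BLEU up to bigrams; it is a simplified illustration under assumptions, not the authors' implementation, and `weighted_bleu`, its toy inputs, and the example weights are all invented here.

```python
from collections import Counter
import math

def ngrams(tokens, n):
    """All n-grams of a token list, as a Counter."""
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def weighted_bleu(hypothesis, references, weights, max_n=2):
    """Sentence-level, quality-weighted multi-reference BLEU sketch.

    `references` is a list of token lists and `weights` their quality
    scores; each clipped n-gram match is scaled by the weight of the
    best-matching reference, in the style of Delta-BLEU.
    """
    log_precisions = []
    for n in range(1, max_n + 1):
        hyp_counts = ngrams(hypothesis, n)
        ref_counts = [ngrams(r, n) for r in references]
        w_max = max(weights)
        numer = denom = 0.0
        for gram, h_count in hyp_counts.items():
            # Numerator: best weighted clipped match over all references.
            numer += max(
                (w * min(h_count, rc[gram])
                 for w, rc in zip(weights, ref_counts) if gram in rc),
                default=0.0,
            )
            # Denominator: hypothesis count scaled by the best available
            # weight, following Delta-BLEU-style normalization.
            denom += w_max * h_count
        if numer <= 0 or denom <= 0:
            return 0.0  # no positively weighted overlap at this order
        log_precisions.append(math.log(numer / denom))
    # Brevity penalty against the closest reference length.
    ref_len = min((len(r) for r in references),
                  key=lambda l: abs(l - len(hypothesis)))
    bp = 1.0 if len(hypothesis) >= ref_len else math.exp(1 - ref_len / len(hypothesis))
    return bp * math.exp(sum(log_precisions) / max_n)

# Toy usage: two automatically collected references with quality weights of
# the kind upsilon-BLEU's trained rater would assign (values made up here).
hyp = "i will be there soon".split()
refs = ["i will arrive soon".split(), "see you never".split()]
print(weighted_bleu(hyp, refs, weights=[0.9, 0.2]))
```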