@inproceedings{liang-etal-2020-beyond,
title = "Beyond User Self-Reported {L}ikert Scale Ratings: A Comparison Model for Automatic Dialog Evaluation",
author = "Liang, Weixin and
Zou, James and
Yu, Zhou",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Author-page-Marten-During-lu/2020.acl-main.126/",
doi = "10.18653/v1/2020.acl-main.126",
pages = "1363--1374",
abstract = "Open Domain dialog system evaluation is one of the most important challenges in dialog research. Existing automatic evaluation metrics, such as BLEU are mostly reference-based. They calculate the difference between the generated response and a limited number of available references. Likert-score based self-reported user rating is widely adopted by social conversational systems, such as Amazon Alexa Prize chatbots. However, self-reported user rating suffers from bias and variance among different users. To alleviate this problem, we formulate dialog evaluation as a comparison task. We also propose an automatic evaluation model CMADE (Comparison Model for Automatic Dialog Evaluation) that automatically cleans self-reported user ratings as it trains on them. Specifically, we first use a self-supervised method to learn better dialog feature representation, and then use KNN and Shapley to remove confusing samples. Our experiments show that CMADE achieves 89.2{\%} accuracy in the dialog comparison task."
}
Markdown (Informal)
[Beyond User Self-Reported Likert Scale Ratings: A Comparison Model for Automatic Dialog Evaluation](https://aclanthology.org/2020.acl-main.126/) (Liang et al., ACL 2020)
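
The abstract describes the CMADE pipeline only at a high level. The sketch below is a minimal, illustrative Python example of two of the ideas it names: recasting self-reported Likert ratings as pairwise comparison labels, and flagging confusing samples by nearest-neighbour label disagreement. It is a simplified stand-in, not the paper's method: the learned dialog representation, the Shapley-based data valuation, and the actual CMADE model are not reproduced here, and all function names and the synthetic data are hypothetical.

```python
# Illustrative sketch only: pairwise comparison labels from Likert ratings,
# plus a crude KNN label-consistency check standing in for the paper's
# KNN + Shapley data-cleaning step.
import numpy as np

def make_comparison_pairs(features, ratings, rng):
    """Turn per-dialog Likert ratings into pairwise comparison examples.

    Each example is (features of dialog A, features of dialog B, label),
    where label = 1 if dialog A was rated higher than dialog B.
    """
    pairs = []
    n = len(ratings)
    for _ in range(4 * n):                      # sample a few pairs per dialog
        i, j = rng.choice(n, size=2, replace=False)
        if ratings[i] == ratings[j]:
            continue                            # ties carry no comparison signal
        pairs.append((features[i], features[j], int(ratings[i] > ratings[j])))
    return pairs

def knn_confusion_scores(features, ratings, k=5):
    """Score how strongly each dialog's rating disagrees with its k nearest neighbours."""
    dists = np.linalg.norm(features[:, None, :] - features[None, :, :], axis=-1)
    np.fill_diagonal(dists, np.inf)             # exclude each dialog from its own neighbours
    neighbours = np.argsort(dists, axis=1)[:, :k]
    agree = (ratings[neighbours] == ratings[:, None]).mean(axis=1)
    return 1.0 - agree                          # higher = more confusing

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    feats = rng.normal(size=(200, 16))          # placeholder dialog features
    rates = rng.integers(1, 6, size=200)        # placeholder 1-5 Likert ratings
    pairs = make_comparison_pairs(feats, rates, rng)
    noisy = knn_confusion_scores(feats, rates) > 0.8
    print(f"{len(pairs)} comparison pairs, {noisy.sum()} dialogs flagged as noisy")
```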