% SIGDIAL 2020 paper by Finch and Choi (ACL Anthology ID 2020.sigdial-1.29).
% The url field uses the permanent Anthology address rather than a staging preview link.
% NOTE(review): address carries the venue string as exported, not a publisher city; confirm it renders acceptably under the target style.
@inproceedings{finch-choi-2020-towards,
  title     = {Towards Unified Dialogue System Evaluation: A Comprehensive Analysis of Current Evaluation Protocols},
  author    = {Finch, Sarah E. and
               Choi, Jinho D.},
  editor    = {Pietquin, Olivier and
               Muresan, Smaranda and
               Chen, Vivian and
               Kennington, Casey and
               Vandyke, David and
               Dethlefs, Nina and
               Inoue, Koji and
               Ekstedt, Erik and
               Ultes, Stefan},
  booktitle = {Proceedings of the 21st Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  month     = jul,
  year      = {2020},
  address   = {1st virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.sigdial-1.29/},
  doi       = {10.18653/v1/2020.sigdial-1.29},
  pages     = {236--245},
  abstract  = {As conversational AI-based dialogue management has increasingly become a trending topic, the need for a standardized and reliable evaluation procedure grows even more pressing. The current state of affairs suggests various evaluation protocols to assess chat-oriented dialogue management systems, rendering it difficult to conduct fair comparative studies across different approaches and gain an insightful understanding of their values. To foster this research, a more robust evaluation protocol must be set in place. This paper presents a comprehensive synthesis of both automated and human evaluation methods on dialogue systems, identifying their shortcomings while accumulating evidence towards the most effective evaluation dimensions. A total of 20 papers from the last two years are surveyed to analyze three types of evaluation protocols: automated, static, and interactive. Finally, the evaluation dimensions used in these papers are compared against our expert evaluation on the system-user dialogue data collected from the Alexa Prize 2020.},
}
Markdown (Informal)
[Towards Unified Dialogue System Evaluation: A Comprehensive Analysis of Current Evaluation Protocols](https://aclanthology.org/2020.sigdial-1.29/) (Finch & Choi, SIGDIAL 2020)
ACL