% GEM 2021 workshop paper on inconsistencies in MultiWOZ benchmark evaluation
% (BLEU, Inform and Success rates).
@inproceedings{nekvinda-dusek-2021-shades,
    title = "Shades of {BLEU}, Flavours of Success: The Case of {MultiWOZ}",
    author = "Nekvinda, Tom{\'a}{\v{s}} and
      Du{\v{s}}ek, Ond{\v{r}}ej",
    editor = "Bosselut, Antoine and
      Durmus, Esin and
      Gangal, Varun Prashant and
      Gehrmann, Sebastian and
      Jernite, Yacine and
      Perez-Beltrachini, Laura and
      Shaikh, Samira and
      Xu, Wei",
    booktitle = "Proceedings of the 1st Workshop on Natural Language Generation, Evaluation, and Metrics (GEM 2021)",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.gem-1.4/",
    doi = "10.18653/v1/2021.gem-1.4",
    pages = "34--46",
    abstract = "The MultiWOZ dataset (Budzianowski et al., 2018) is frequently used for benchmarking context-to-response abilities of task-oriented dialogue systems. In this work, we identify inconsistencies in data preprocessing and reporting of three corpus-based metrics used on this dataset, i.e., BLEU score and Inform {\&} Success rates. We point out a few problems of the MultiWOZ benchmark such as unsatisfactory preprocessing, insufficient or under-specified evaluation metrics, or rigid database. We re-evaluate 7 end-to-end and 6 policy optimization models in as-fair-as-possible setups, and we show that their reported scores cannot be directly compared. To facilitate comparison of future systems, we release our stand-alone standardized evaluation scripts. We also give basic recommendations for corpus-based benchmarking in future works."
}
Markdown (Informal)
[Shades of BLEU, Flavours of Success: The Case of MultiWOZ](https://aclanthology.org/2021.gem-1.4/) (Nekvinda & Dušek, GEM 2021)
ACL