% Khojah, Berman and Larsson, SIGDIAL 2022 (ACL Anthology ID 2022.sigdial-1.54).
% The url field uses the canonical ACL Anthology landing page rather than a
% temporary preview-build link; the doi field remains the preferred identifier.
% {N-best} is braced in the title so sentence-casing styles keep the capital N.
@inproceedings{khojah-etal-2022-evaluating,
  title     = {Evaluating {N-best} Calibration of Natural Language Understanding for Dialogue Systems},
  author    = {Khojah, Ranim and
               Berman, Alexander and
               Larsson, Staffan},
  editor    = {Lemon, Oliver and
               Hakkani-Tur, Dilek and
               Li, Junyi Jessy and
               Ashrafzadeh, Arash and
               Garcia, Daniel Hern{\'a}ndez and
               Alikhani, Malihe and
               Vandyke, David and
               Du{\v{s}}ek, Ond{\v{r}}ej},
  booktitle = {Proceedings of the 23rd Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  month     = sep,
  year      = {2022},
  address   = {Edinburgh, UK},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.sigdial-1.54/},
  doi       = {10.18653/v1/2022.sigdial-1.54},
  pages     = {582--594},
  abstract  = {A Natural Language Understanding (NLU) component can be used in a dialogue system to perform intent classification, returning an N-best list of hypotheses with corresponding confidence estimates. We perform an in-depth evaluation of 5 NLUs, focusing on confidence estimation. We measure and visualize calibration for the 10 best hypotheses on model level and rank level, and also measure classification performance. The results indicate a trade-off between calibration and performance. In particular, Rasa (with Sklearn classifier) had the best calibration but the lowest performance scores, while Watson Assistant had the best performance but a poor calibration.},
}
Markdown (Informal)
[Evaluating N-best Calibration of Natural Language Understanding for Dialogue Systems](https://aclanthology.org/2022.sigdial-1.54/) (Khojah et al., SIGDIAL 2022)
ACL