@inproceedings{lykartsis-kotti-2019-prediction,
  title     = {Prediction of User Emotion and Dialogue Success Using Audio Spectrograms and Convolutional Neural Networks},
  author    = {Lykartsis, Athanasios and
               Kotti, Margarita},
  editor    = {Nakamura, Satoshi and
               Gasic, Milica and
               Zukerman, Ingrid and
               Skantze, Gabriel and
               Nakano, Mikio and
               Papangelis, Alexandros and
               Ultes, Stefan and
               Yoshino, Koichiro},
  booktitle = {Proceedings of the 20th Annual {SIGdial} Meeting on Discourse and Dialogue},
  month     = sep,
  year      = {2019},
  address   = {Stockholm, Sweden},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/jlcl-multiple-ingestion/W19-5939/},
  doi       = {10.18653/v1/W19-5939},
  pages     = {336--344},
  abstract  = {In this paper we aim to predict dialogue success and user satisfaction as well as emotion on a turn level. To achieve this, we investigate the use of spectrogram representations, extracted from audio files, in combination with several types of convolutional neural networks. The experiments were performed on the Let's Go V2 database, comprising 5065 audio files and having labels for subjective and objective dialogue turn success, as well as the emotional state of the user. Results show that by using only audio, it is possible to predict turn success with very high accuracy for all three labels (90{\%}). The best performing input representation were 1s long mel-spectrograms in combination with a CNN with a bottleneck architecture. The resulting system has the potential to be used real-time. Our results significantly surpass the state of the art for dialogue success prediction based only on audio.},
}
Markdown (Informal)
[Prediction of User Emotion and Dialogue Success Using Audio Spectrograms and Convolutional Neural Networks](https://preview.aclanthology.org/jlcl-multiple-ingestion/W19-5939/) (Lykartsis & Kotti, SIGDIAL 2019)
ACL