% LREC 2012 paper by Georgila, Black, Sagae and Traum (ACL Anthology ID L12-1318).
% The url field uses the canonical Anthology landing page rather than a
% temporary branch-preview host, so the link stays valid over time.
@inproceedings{georgila-etal-2012-practical,
  title     = {Practical Evaluation of Human and Synthesized Speech for Virtual Human Dialogue Systems},
  author    = {Georgila, Kallirroi and
               Black, Alan and
               Sagae, Kenji and
               Traum, David},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Declerck, Thierry and
               Do{\u{g}}an, Mehmet U{\u{g}}ur and
               Maegaard, Bente and
               Mariani, Joseph and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  month     = may,
  year      = {2012},
  address   = {Istanbul, Turkey},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://aclanthology.org/L12-1318/},
  pages     = {3519--3526},
  abstract  = {The current practice in virtual human dialogue systems is to use professional human recordings or limited-domain speech synthesis. Both approaches lead to good performance but at a high cost. To determine the best trade-off between performance and cost, we perform a systematic evaluation of human and synthesized voices with regard to naturalness, conversational aspect, and likability. We vary the type (in-domain vs. out-of-domain), length, and content of utterances, and take into account the age and native language of raters as well as their familiarity with speech synthesis. We present detailed results from two studies, a pilot one and one run on Amazon's Mechanical Turk. Our results suggest that a professional human voice can supersede both an amateur human voice and synthesized voices. Also, a high-quality general-purpose voice or a good limited-domain voice can perform better than amateur human recordings. We do not find any significant differences between the performance of a high-quality general-purpose voice and a limited-domain voice, both trained with speech recorded by actors. As expected, the high-quality general-purpose voice is rated higher than the limited-domain voice for out-of-domain sentences and lower for in-domain sentences. There is also a trend for long or negative-content utterances to receive lower ratings.},
}
Markdown (Informal)
[Practical Evaluation of Human and Synthesized Speech for Virtual Human Dialogue Systems](https://aclanthology.org/L12-1318/) (Georgila et al., LREC 2012)
ACL