@inproceedings{jeon-etal-2025-prompt,
title = "Prompt-Guided Selective Masking Loss for Context-Aware Emotive Text-to-Speech",
author = "Jeon, Yejin and
Kim, Youngjae and
Lee, Jihyun and
Lee, Gary",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.findings-naacl.38/",
pages = "638--650",
ISBN = "979-8-89176-195-7",
abstract = "Emotional dialogue speech synthesis (EDSS) aims to generate expressive speech by leveraging the dialogue context between interlocutors. This is typically done by concatenating global representations of previous utterances as conditions for text-to-speech (TTS) systems. However, such approaches overlook the importance of integrating localized acoustic cues that convey emotion. To address this, we introduce a novel approach that utilizes a large language model (LLM) to generate holistic emotion tags based on prior dialogue context, while also pinpointing key words in the target utterance that align with the predicted emotional state. Furthermore, we enhance the emotional richness of synthesized speech by incorporating concentrated acoustic features of these key words through a novel selective audio masking loss function. This methodology not only improves emotional expressiveness, but also facilitates automatic emotion speech generation during inference by eliminating the need for manual emotion tag selection. Comprehensive subjective and objective evaluations and analyses demonstrate the effectiveness of the proposed approach."
}
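The paper's exact loss formulation is not reproduced here, but the abstract's core idea of a "selective audio masking loss" that concentrates on key-word frames can be illustrated with a minimal sketch. The sketch below assumes a frame-weighted L1 reconstruction loss over mel-spectrograms; the function name `selective_masking_loss`, the `keyword_mask` frame alignment, and the `alpha` weighting factor are all hypothetical stand-ins, not the authors' implementation.

```python
import torch
import torch.nn.functional as F

def selective_masking_loss(pred_mel: torch.Tensor,
                           target_mel: torch.Tensor,
                           keyword_mask: torch.Tensor,
                           alpha: float = 2.0) -> torch.Tensor:
    # pred_mel, target_mel: (batch, frames, n_mels) mel-spectrograms.
    # keyword_mask: (batch, frames) binary mask, 1 on frames aligned with
    # the LLM-selected key words, 0 elsewhere. (Hypothetical interface.)
    per_frame = F.l1_loss(pred_mel, target_mel, reduction="none").mean(dim=-1)
    # Up-weight reconstruction error on key-word frames so training
    # concentrates on their localized acoustic detail.
    weights = 1.0 + (alpha - 1.0) * keyword_mask
    return (weights * per_frame).mean()

# Toy usage: frames 40-60 are assumed to cover an emotion-salient key word.
pred = torch.randn(2, 100, 80)
target = torch.randn(2, 100, 80)
mask = torch.zeros(2, 100)
mask[:, 40:60] = 1.0
loss = selective_masking_loss(pred, target, mask)
```

In this reading, setting `alpha > 1` emphasizes key-word frames while still supervising the rest of the utterance; the actual paper may combine this with other TTS losses and a different alignment scheme.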