% Bak et al., Findings of EMNLP 2024. The url field holds the stable ACL Anthology address (same identifier as the DOI suffix), not a preview-branch link.
@inproceedings{bak-etal-2024-multiverse,
    title = "{MultiVerse}: Efficient and Expressive Zero-Shot Multi-Task Text-to-Speech",
    author = "Bak, Taejun and
      Eom, Youngsik and
      Choi, SeungJae and
      Joo, Young-Sun",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-emnlp.533/",
    doi = "10.18653/v1/2024.findings-emnlp.533",
    pages = "9130--9147",
    abstract = "Text-to-speech (TTS) systems that scale up the amount of training data have achieved significant improvements in zero-shot speech synthesis. However, these systems have certain limitations: they require a large amount of training data, which increases costs, and often overlook prosody similarity. To address these issues, we propose MultiVerse, a zero-shot multi-task TTS system that is able to perform TTS or speech style transfer in zero-shot and cross-lingual conditions. MultiVerse requires much less training data than traditional data-driven approaches. To ensure zero-shot performance even with limited data, we leverage source-filter theory-based disentanglement, utilizing the prompt for modeling filter-related and source-related representations. Additionally, to further enhance prosody similarity, we adopt a prosody modeling approach combining prompt-based autoregressive and non-autoregressive methods. Evaluations demonstrate the remarkable zero-shot multi-task TTS performance of MultiVerse and show that MultiVerse not only achieves zero-shot TTS performance comparable to data-driven TTS systems with much less data, but also significantly outperforms other zero-shot TTS systems trained with the same small amount of data. In particular, our novel prosody modeling technique significantly contributes to MultiVerse{'}s ability to generate speech with high prosody similarity to the given prompts."
}
Markdown (Informal)
[MultiVerse: Efficient and Expressive Zero-Shot Multi-Task Text-to-Speech](https://aclanthology.org/2024.findings-emnlp.533/) (Bak et al., Findings 2024)
ACL