@comment{
  ProsodyFlow conference paper, COLING 2025 main proceedings
  (ACL Anthology ID 2025.coling-main.518).
  Names are stored in "von Last, First" form; the editor surname
  "Di Eugenio" keeps its particle on the surname side of the comma so
  that styles sort and abbreviate it correctly.
  The url field holds the Anthology permalink derived from the ID above,
  not a preview/staging link.
}
@inproceedings{wang-etal-2025-prosodyflow,
  title     = {{ProsodyFlow}: High-fidelity Text-to-Speech through Conditional Flow Matching and Prosody Modeling with Large Speech Language Models},
  author    = {Wang, Haoyu and
               Shan, Sizhe and
               Guo, Yinlin and
               Wang, Yuehai},
  editor    = {Rambow, Owen and
               Wanner, Leo and
               Apidianaki, Marianna and
               Al-Khalifa, Hend and
               Di Eugenio, Barbara and
               Schockaert, Steven},
  booktitle = {Proceedings of the 31st International Conference on Computational Linguistics},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.coling-main.518/},
  pages     = {7748--7753},
  abstract  = {Text-to-speech (TTS) has seen significant advancements in high-quality, expressive speech synthesis. However, achieving diverse and natural prosody in synthesized speech remains challenging. In this paper, we propose ProsodyFlow, an end-to-end TTS model that integrates large self-supervised speech models and conditional flow matching to model prosodic features effectively. Our approach involves using a speech LLM to extract acoustic features, mapping these features into a prosody latent space, and then employing conditional flow matching to generate prosodic vectors conditioned on the input text. Experiments on the LJSpeech dataset show that ProsodyFlow improves synthesis quality and efficiency compared to existing models, achieving more prosodic and expressive speech synthesizing.},
}
Markdown (Informal)
[ProsodyFlow: High-fidelity Text-to-Speech through Conditional Flow Matching and Prosody Modeling with Large Speech Language Models](https://preview.aclanthology.org/ingest_wac_2008/2025.coling-main.518/) (Wang et al., COLING 2025)
ACL