% Hu et al., Findings of ACL 2025: the Chain-Talker paper on empathetic
% conversational speech synthesis.
%  - {Chain-Talker} is braced in the title so sentence-casing styles keep
%    the capitalisation of the system name.
%  - url uses the permanent aclanthology.org address instead of the
%    temporary preview/ingestion branch.
@inproceedings{hu-etal-2025-chain,
  title     = {{Chain-Talker}: Chain Understanding and Rendering for Empathetic Conversational Speech Synthesis},
  author    = {Hu, Yifan and Liu, Rui and Ren, Yi and Yin, Xiang and Li, Haizhou},
  editor    = {Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.101/},
  pages     = {1988--2003},
  isbn      = {979-8-89176-256-5},
  abstract  = {Conversational Speech Synthesis (CSS) aims to align synthesized speech with the emotional and stylistic context of user-agent interactions to achieve empathy. Current generative CSS models face interpretability limitations due to insufficient emotional perception and redundant discrete speech coding. To address the above issues, we present Chain-Talker, a three-stage framework mimicking human cognition: Emotion Understanding derives context-aware emotion descriptors from dialogue history; Semantic Understanding generates compact semantic codes via serialized prediction; and Empathetic Rendering synthesizes expressive speech by integrating both components. To support emotion modeling, we develop CSS-EmCap, an LLM-driven automated pipeline for generating precise conversational speech emotion captions. Experiments on three benchmark datasets demonstrate that Chain-Talker produces more expressive and empathetic speech than existing methods, with CSS-EmCap contributing to reliable emotion modeling. The code and demos are available at: https://github.com/AI-S2-Lab/Chain-Talker.},
}
Markdown (Informal)
[Chain-Talker: Chain Understanding and Rendering for Empathetic Conversational Speech Synthesis](https://aclanthology.org/2025.findings-acl.101/) (Hu et al., Findings 2025)
ACL