Conference-paper record for "Computational Narrative Understanding for
Expressive Text-to-Speech" (Findings of ACL 2026). Text outside an entry is
ignored by BibTeX, so these lines act as a comment.
NOTE(review): the url field points at a preview.aclanthology.org ingest path;
confirm the permanent address once the volume is published.
NOTE(review): address carries the location given in the exported record,
presumably the conference venue rather than the publisher's city; verify
against the target style.

@inproceedings{michel-etal-2026-computational,
  title     = {Computational Narrative Understanding for Expressive Text-to-Speech},
  author    = {Michel, Gaspard and
               Epure, Elena V. and
               Cerisara, Christophe},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.308/},
  pages     = {6194--6215},
  isbn      = {979-8-89176-395-1},
  abstract  = {Recent advances in text-to-speech (TTS) have been driven by large, multi-domain speech corpora, yet the expressive potential of audiobook data remains underexamined. We argue that human-narrated audiobooks, particularly fictional works, contain rich and diverse prosodic cues arising from the natural alternation between neutral narration and expressive character dialogue. Building from this observation, we introduce LibriQuote, a large-scale 5.3K hours of expressive speech drawn from character quotations. Each quote is supplemented with contextual pseudo-labels for speech verbs and adverbs that characterize the intended delivery of direct speech (e.g., ``\textit{he whispered softly}''). We found that fine-tuning a flow-matching model on LibriQuote yields substantial improvements in expressivity and intelligibility, while training from scratch enhances expressiveness of an autoregressive TTS model. Benchmarking on LibriQuote-\textit{test} highlights significant variability across systems in generating expressive speech. We publicly release the dataset, code, and evaluation resources to facilitate reproducibility. Audio samples can be found at \url{https://libriquote.github.io/}.},
}
Markdown (Informal)
[Computational Narrative Understanding for Expressive Text-to-Speech](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.308/) (Michel et al., Findings 2026)
ACL