@inproceedings{lindevelt-etal-2026-correlation,
    title     = {The Correlation Between Emotion in Text and Speech Segments is Limited: A Cross-Modal Study},
    author    = {Lindevelt, David and
      Verberne, Suzan and
      Broekens, Joost},
    editor    = {Demberg, Vera and
      Inui, Kentaro and
      M{\`a}rquez, Llu{\'i}s},
    booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-eacl.136/},
    pages     = {2611--2621},
    isbn      = {979-8-89176-386-9},
    abstract  = {Although expressive TTS systems aim to capture human-like emotion, little is known about how well emotional signals in text correspond to those in speech. In this short paper, we investigate how emotion (Valence, Arousal, Dominance) in text relates to emotion in speech. We use 8 large language models for identifying emotion in text and two audio models for emotion in speech, across three genres: Podcasts, Audiobooks and TED talks. Findings show that while language models perform well on emotion recognition from situational text, and the audio models perform well on speech, they show a strong correlation for Valence only. Further, the genre of the content significantly impacts the correlation: audiobooks exhibit higher text-audio correlation than TED talks. Finally, we show that more context for LLMs fails to improve this correlation between text and speech emotion prediction. Our results highlight that emotional signals in text do not correspond well to those in speech: emotion prediction from text alone is insufficient for emotional TTS.}
}
Markdown (Informal)
[The Correlation Between Emotion in Text and Speech Segments is Limited: A Cross-Modal Study](https://aclanthology.org/2026.findings-eacl.136/) (Lindevelt et al., Findings 2026)
ACL