% Conference paper entry: Girish, Akhtar and Singh, long-papers volume of the
% 64th ACL Annual Meeting (2026), pages 41881--41893.
% NOTE(review): the url field points at a preview.aclanthology.org "ingest-acl"
% path; confirm whether a canonical Anthology URL and a DOI are available.
@inproceedings{girish-etal-2026-prosody,
  title     = {Prosody as Supervision: Bridging the Non-Verbal{--}Verbal for Multilingual Speech Emotion Recognition},
  author    = {Girish and
               Akhtar, Mohd Mujtaba and
               Singh, Muskaan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1940/},
  pages     = {41881--41893},
  isbn      = {979-8-89176-390-6},
  abstract  = {In this work, we introduce a paralinguistic supervision paradigm for low-resource multilingual speech emotion recognition (LRM-SER) that leverages non-verbal vocalizations to exploit prosody-centric emotion cues. Unlike conventional SER systems that rely heavily on labeled verbal speech and suffer from poor cross-lingual transfer, our approach reformulates LRM-SER as non-verbal-to-verbal transfer, where supervision from a labelled non-verbal source domain is adapted to unlabeled verbal speech across multiple target languages. To this end, we propose NOVA-ARC, a geometry-aware framework that models affective structure in the Poincar{\'e} ball, discretizes paralinguistic patterns via a hyperbolic vector-quantized prosody codebook, and captures emotion intensity through a hyperbolic emotion lens. For unsupervised adaptation, NOVA-ARC performs optimal-transport-based prototype alignment between source emotion prototypes and target utterances, inducing soft supervision for unlabeled speech while being stabilized through consistency regularization. Experiments show that NOVA-ARC delivers the strongest performance under both non-verbal-to-verbal adaptation and the complementary verbal-to-verbal transfer setting, consistently outperforming Euclidean counterparts and strong SSL baselines. To the best of our knowledge, this work is the first to move beyond verbal-speech{--}centric supervision by introducing a non-verbal{--}to{--}verbal transfer paradigm for SER.},
}
Markdown (Informal)
[Prosody as Supervision: Bridging the Non-Verbal–Verbal for Multilingual Speech Emotion Recognition](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1940/) (Girish et al., ACL 2026)
ACL