% N-CORE (Shah and Johnson), paper in the EMNLP 2025 proceedings.
% The url field uses the canonical ACL Anthology address for the anthology ID
% that also appears as the DOI suffix, instead of a temporary preview build.
% Braces in the title protect the acronym and the symbol N from sentence-casing.
@inproceedings{shah-johnson-2025-n,
  title     = {{N-CORE}: {N}-View Consistency Regularization for Disentangled Representation Learning in Nonverbal Vocalizations},
  author    = {Shah, Siddhant Bikram and
               Johnson, Kristina T.},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.1693/},
  doi       = {10.18653/v1/2025.emnlp-main.1693},
  pages     = {33362--33379},
  isbn      = {979-8-89176-332-6},
  abstract  = {Nonverbal vocalizations are an essential component of human communication, conveying rich information without linguistic content. However, their computational analysis is hindered by a lack of lexical anchors in the data, compounded by biased and imbalanced data distributions. While disentangled representation learning has shown promise in isolating specific speech features, its application to nonverbal vocalizations remains unexplored. In this paper, we introduce N-CORE, a novel backbone-agnostic framework designed to disentangle intertwined features like emotion and speaker information from nonverbal vocalizations by leveraging N views of audio samples to learn invariance to specific transformations. N-CORE achieves competitive performance compared to state-of-the-art methods for emotion and speaker classification on the VIVAE, ReCANVo, and ReCANVo-Balanced datasets. We further propose an emotion perturbation function that disrupts affective information while preserving speaker information in audio signals for emotion-invariant speaker classification. Our work informs research directions on paralinguistic speech processing, including clinical diagnoses of atypical speech and longitudinal analysis of communicative development. Our code is available at https://github.com/SiddhantBikram/N-CORE.},
}
Markdown (Informal)
[N-CORE: N-View Consistency Regularization for Disentangled Representation Learning in Nonverbal Vocalizations](https://aclanthology.org/2025.emnlp-main.1693/) (Shah & Johnson, EMNLP 2025)
ACL