@inproceedings{tian-etal-2025-multi,
title = "Multi-Condition Guided Diffusion Network for Multimodal Emotion Recognition in Conversation",
author = "Tian, Wenjin and
Huang, Xianying and
Zou, Shihao",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.findings-naacl.177/",
pages = "3215--3227",
ISBN = "979-8-89176-195-7",
abstract = "Emotion recognition in conversation (ERC) involves identifying emotional labels associated with utterances within a conversation, a task that is essential for developing empathetic robots. Current research emphasizes contextual factors, the speaker{'}s influence, and extracting complementary information across different modalities. However, it often overlooks the cross-modal noise at the semantic level and the redundant information brought by the features themselves. This study introduces a diffusion-based approach designed to effectively address the challenges posed by redundant information and unexpected noise while robustly capturing shared semantics, thus facilitating the learning of compact and representative features from multimodal data. Specifically, we present the Multi-Condition Guided Diffusion Network (McDiff). McDiff employs a modal prior knowledge extraction strategy to derive the prior distribution for each modality, thereby enhancing the regional attention of each modality and applying the generated prior distribution at each diffusion step. Furthermore, we propose a method to learn the mutual information of each modality through a specific objective constraints approach prior to the forward process, which aims to improve inter-modal interaction and mitigate the effects of noise and redundancy. Comprehensive experiments conducted on two multimodal datasets, IEMOCAP and MELD, demonstrate that McDiff significantly surpasses existing state-of-the-art methodologies, thereby affirming the generalizability and efficacy of the proposed model."
}
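
The abstract describes conditioning every diffusion step on a per-modality prior distribution. The paper's implementation is not reproduced here, so the following is only a minimal sketch of that general idea, prior-conditioned denoising within a standard DDPM forward process, written in PyTorch. All names (GuidedDenoiser, forward_diffuse) and the linear noise schedule are hypothetical stand-ins, not the authors' McDiff code.

# Hypothetical sketch, not the authors' implementation: a denoiser that
# receives a per-modality prior (mean / log-variance) at every step.
import torch
import torch.nn as nn

class GuidedDenoiser(nn.Module):
    """Toy denoiser conditioned on a modality prior at each diffusion step."""
    def __init__(self, dim: int):
        super().__init__()
        # Input: noisy feature + prior mean + prior log-variance (each `dim`).
        self.net = nn.Sequential(
            nn.Linear(3 * dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim)
        )

    def forward(self, x_t, prior_mu, prior_logvar):
        # Predict the noise given the noisy feature and the prior condition.
        return self.net(torch.cat([x_t, prior_mu, prior_logvar], dim=-1))

def forward_diffuse(x0, t, betas):
    """Standard DDPM forward process sample from q(x_t | x_0)."""
    alpha_bar = torch.cumprod(1.0 - betas, dim=0)[t]
    noise = torch.randn_like(x0)
    x_t = alpha_bar.sqrt() * x0 + (1.0 - alpha_bar).sqrt() * noise
    return x_t, noise

# Usage on one hypothetical modality's utterance features (e.g. audio).
dim, T = 128, 100
betas = torch.linspace(1e-4, 0.02, T)          # assumed linear schedule
x0 = torch.randn(8, dim)                       # batch of utterance features
prior_mu = x0.mean(0, keepdim=True).expand_as(x0)   # stand-in modality prior
prior_logvar = torch.zeros_like(x0)
t = torch.randint(0, T, (1,)).item()
x_t, noise = forward_diffuse(x0, t, betas)
model = GuidedDenoiser(dim)
loss = nn.functional.mse_loss(model(x_t, prior_mu, prior_logvar), noise)
loss.backward()

How the prior is actually extracted per modality, and how the mutual-information constraint is imposed before the forward process, is specific to the paper and not reconstructed here.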