@inproceedings{zhang-etal-2025-mad,
title = "{T}-{MAD}: Target-driven Multimodal Alignment for Stance Detection",
author = "Zhang, ZhaoDan and
Zhang, Jin and
Cheng, Xueqi and
Xu, Hui",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.30/",
pages = "580--595",
ISBN = "979-8-89176-332-6",
    abstract = "Multimodal Stance Detection (MSD) aims to determine a user{'}s stance - support, oppose, or neutral - toward a target by analyzing multimodal content such as texts and images from social media. Existing MSD methods struggle with generalizing to unseen targets and handling modality inconsistencies. To address these challenges, we propose the Target-driven Multi-modal Alignment and Dynamic Weighting Model (T-MAD), which combines target-driven multi-modal alignment and dynamic weighting mechanisms to capture target-specific relationships and balance modality contributions. The model incorporates iterative reasoning to progressively refine predictions, achieving robust performance in both in-target and zero-shot settings. Experiments on the MMSD and MultiClimate datasets show that T-MAD outperforms state-of-the-art models, with optimal results achieved using RoBERTa, ViT, and an iterative depth of 5. Ablation studies further confirm the importance of multi-modal alignment and dynamic weighting in enhancing model effectiveness."
}