% Wu et al., LREC-COLING 2024 (paper 2024.lrec-main.995): MoPE-BAF soft-prompt framework.
% The url field uses the permanent ACL Anthology permalink rather than a temporary
% preview-branch address, so the link keeps resolving after the preview build is removed.
@inproceedings{wu-etal-2024-mixture,
  title     = {Mixture-of-Prompt-Experts for Multi-modal Semantic Understanding},
  author    = {Wu, Zichen and
               Huang, Hsiu-Yuan and
               Qu, Fanyi and
               Wu, Yunfang},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.995/},
  pages     = {11381--11393},
  abstract  = {Deep multimodal semantic understanding that goes beyond the mere superficial content relation mining has received increasing attention in the realm of artificial intelligence. The challenges of collecting and annotating high-quality multi-modal data have underscored the significance of few-shot learning. In this paper, we focus on two critical tasks under this context: few-shot multi-modal sarcasm detection (MSD) and multi-modal sentiment analysis (MSA). To address them, we propose Mixture-of-Prompt-Experts with Block-Aware Prompt Fusion (MoPE-BAF), a novel multi-modal soft prompt framework based on the unified vision-language model (VLM). Specifically, we design three experts of soft prompts: a text prompt and an image prompt that extract modality-specific features to enrich the single-modal representation, and a unified prompt to assist multi-modal interaction. Additionally, we reorganize Transformer layers into several blocks and introduce cross-modal prompt attention between adjacent blocks, which smoothens the transition from single-modal representation to multi-modal fusion. On both MSD and MSA datasets in few-shot setting, our proposed model not only surpasses the 8.2B model InstructBLIP with merely 2{\%} parameters (150M), but also significantly outperforms other widely-used prompt methods on VLMs or task-specific methods.},
}
Markdown (Informal)
[Mixture-of-Prompt-Experts for Multi-modal Semantic Understanding](https://aclanthology.org/2024.lrec-main.995/) (Wu et al., LREC-COLING 2024)
ACL