@inproceedings{ling-etal-2022-vision,
title = "Vision-Language Pre-Training for Multimodal Aspect-Based Sentiment Analysis",
author = "Ling, Yan and
Yu, Jianfei and
Xia, Rui",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2022.acl-long.152/",
doi = "10.18653/v1/2022.acl-long.152",
pages = "2149--2159",
abstract = "As an important task in sentiment analysis, Multimodal Aspect-Based Sentiment Analysis (MABSA) has attracted increasing attention inrecent years. However, previous approaches either (i) use separately pre-trained visual and textual models, which ignore the crossmodalalignment or (ii) use vision-language models pre-trained with general pre-training tasks, which are inadequate to identify fine-grainedaspects, opinions, and their alignments across modalities. To tackle these limitations, we propose a task-specific Vision-LanguagePre-training framework for MABSA (VLP-MABSA), which is a unified multimodal encoder-decoder architecture for all the pretrainingand downstream tasks. We further design three types of task-specific pre-training tasks from the language, vision, and multimodalmodalities, respectively. Experimental results show that our approach generally outperforms the state-of-the-art approaches on three MABSA subtasks. Further analysis demonstrates the effectiveness of each pre-training task. The source code is publicly released at \url{https://github.com/NUSTM/VLP-MABSA}."
}