@inproceedings{wang-etal-2021-mirtt-learning,
title = "{MIRTT}: Learning Multimodal Interaction Representations from Trilinear Transformers for Visual Question Answering",
author = "Wang, Junjie and
Ji, Yatai and
Sun, Jiaqi and
Yang, Yujiu and
Sakai, Tetsuya",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.findings-emnlp.196/",
doi = "10.18653/v1/2021.findings-emnlp.196",
pages = "2280--2292",
abstract = "In Visual Question Answering (VQA), existing bilinear methods focus on the interaction between images and questions. As a result, the answers are either spliced into the questions or utilized as labels only for classification. On the other hand, trilinear models such as the CTI model efficiently utilize the inter-modality information between answers, questions, and images, while ignoring intra-modality information. Inspired by this observation, we propose a new trilinear interaction framework called MIRTT (Learning Multimodal Interaction Representations from Trilinear Transformers), incorporating the attention mechanisms for capturing inter-modality and intra-modality relationships. Moreover, we design a two-stage workflow where a bilinear model reduces the free-form, open-ended VQA problem into a multiple-choice VQA problem. Furthermore, to obtain accurate and generic multimodal representations, we pre-train MIRTT with masked language prediction. Our method achieves state-of-the-art performance on the Visual7W Telling task and VQA-1.0 Multiple Choice task and outperforms bilinear baselines on the VQA-2.0, TDIUC and GQA datasets."
}