@inproceedings{yang-wang-2024-figclip,
title = "{F}ig{CLIP}: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment",
author = "Yang, Qihao and
Wang, Xuelin",
editor = "Ghosh, Debanjan and
Muresan, Smaranda and
Feldman, Anna and
Chakrabarty, Tuhin and
Liu, Emmy",
booktitle = "Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.figlang-1.13/",
doi = "10.18653/v1/2024.figlang-1.13",
pages = "92--98",
abstract = "This is a system paper for the FigLang-2024 Multimodal Figurative Language Shared Task. Figurative language is generally represented through multiple modalities, facilitating the expression of complex and abstract ideas. With the popularity of various text-to-image tools, a large number of images containing metaphors or ironies are created. Traditional recognizing textual entailment has been extended to the task of understanding figurative language via visual entailment. However, existing pre-trained multimodal models in open domains often struggle with this task due to the intertwining of counterfactuals, human culture, and imagination. To bridge this gap, we propose FigCLIP, an end-to-end model based on CLIP and GPT-2, to identify multimodal figurative semantics and generate explanations. It employs a bidirectional fusion module with cross-attention and leverages explanations to promote the alignment of figurative image-text representations. Experimental results on the benchmark demonstrate the effectiveness of our method, achieving 70{\%} F1-score, 67{\%} F1@50-score and 50{\%} F1@60-score. It outperforms GPT-4V, which has robust visual reasoning capabilities."
}
Markdown (Informal)
[FigCLIP: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment](https://aclanthology.org/2024.figlang-1.13/) (Yang & Wang, Fig-Lang 2024)
ACL
Qihao Yang and Xuelin Wang. 2024. FigCLIP: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment. In Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024), pages 92–98, Mexico City, Mexico (Hybrid). Association for Computational Linguistics.
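
The abstract's central component, a bidirectional fusion module that cross-attends between CLIP image and text features, can be sketched in a few lines of PyTorch. The sketch below is an illustration under assumptions, not the authors' implementation: the class name, the head count, the residual-plus-LayerNorm wiring, and the 512-dimensional feature size (typical of CLIP ViT-B/32 projections) are all ours.

# A minimal sketch of bidirectional cross-attention fusion of the kind the
# abstract describes. All names and dimensions are illustrative assumptions,
# not FigCLIP's actual code.
import torch
import torch.nn as nn

class BidirectionalCrossAttentionFusion(nn.Module):
    def __init__(self, dim: int = 512, num_heads: int = 8):
        super().__init__()
        # Text queries attend over image tokens, and vice versa.
        self.text_to_image = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.image_to_text = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.norm_text = nn.LayerNorm(dim)
        self.norm_image = nn.LayerNorm(dim)

    def forward(self, text_feats: torch.Tensor, image_feats: torch.Tensor):
        # text_feats: (batch, text_len, dim); image_feats: (batch, num_patches, dim)
        t2i, _ = self.text_to_image(text_feats, image_feats, image_feats)
        i2t, _ = self.image_to_text(image_feats, text_feats, text_feats)
        # Residual connections keep each modality's original features in play.
        fused_text = self.norm_text(text_feats + t2i)
        fused_image = self.norm_image(image_feats + i2t)
        return fused_text, fused_image

# Hypothetical usage with CLIP-sized features:
fusion = BidirectionalCrossAttentionFusion(dim=512, num_heads=8)
text = torch.randn(2, 77, 512)   # e.g., CLIP text token features
image = torch.randn(2, 50, 512)  # e.g., CLIP ViT patch features
fused_text, fused_image = fusion(text, image)
print(fused_text.shape, fused_image.shape)
# torch.Size([2, 77, 512]) torch.Size([2, 50, 512])

The residual connections let each modality retain its unimodal CLIP representation while attending to the other, which fits the abstract's stated goal of aligning figurative image-text representations before explanation generation.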