@inproceedings{yang-etal-2025-magic,
title = "{MAGIC}-{VQA}: Multimodal And Grounded Inference with Commonsense Knowledge for Visual Question Answering",
author = "Yang, Shuo and
Han, Caren and
Luo, Siwen and
Hovy, Eduard",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.872/",
pages = "16967--16986",
ISBN = "979-8-89176-256-5",
abstract = "Visual Question Answering (VQA) necessitates models to reason effectively across visual and textual modalities. However, existing Large Vision-Language Models (LVLMs) often fall short in achieving human-like reasoning due to a lack of integrated commonsense knowledge, limiting their robustness and accuracy in real-world scenarios where both explicit facts and implicit understanding are crucial. To address this challenge, we present MAGIC-VQA: Multimodal And Grounded Inference with Commonsense Knowledge, a novel framework designed to enhance multimodal inference by integrating commonsense reasoning. MAGIC-VQA introduces a three-stage process: (1) Explicit Commonsense Knowledge Retrieval from external knowledge graphs, (2) By-Type Commonsense Knowledge Post-Processing to refine contextual relevance, and (3) Implicit Commonsense Knowledge Augmentation using a heterogeneous graph processed by a Graph Neural Network (GNN). These stages collectively enable nuanced, context-aware reasoning without extensive pre-training or intricate prompt tuning.Our MAGIC-VQA significantly improves comprehensive benchmark datasets, surpassing existing models in tasks requiring advanced commonsense reasoning. MAGIC-VQA establishes a robust pathway for integrating commonsense knowledge into VQA, bridging the gap between vision-language inputs and high-level reasoning for improved reliability and contextual accuracy."
}
Markdown (Informal)
[MAGIC-VQA: Multimodal And Grounded Inference with Commonsense Knowledge for Visual Question Answering](https://aclanthology.org/2025.findings-acl.872/) (Yang et al., Findings 2025)
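
The abstract above outlines a three-stage pipeline: explicit commonsense knowledge retrieval from external knowledge graphs, by-type post-processing, and implicit augmentation via a GNN over a heterogeneous graph. Below is a minimal, purely illustrative Python sketch of that flow, not the authors' implementation: the toy knowledge graph, the relation names, the filter logic, and the single mean message-passing step standing in for a learned GNN are all assumptions introduced here to make the staging concrete.

```python
# Illustrative sketch of a three-stage commonsense-augmented VQA pipeline
# in the spirit of the MAGIC-VQA abstract. Every name and value below
# (the toy KG, relation types, the averaging step) is a hypothetical
# stand-in, not the paper's implementation.

from collections import defaultdict
from dataclasses import dataclass


@dataclass(frozen=True)
class Triple:
    head: str
    relation: str  # e.g. "UsedFor", "AtLocation", "Causes"
    tail: str


# A toy external knowledge graph; a real system would query a large
# resource such as ConceptNet or ATOMIC (assumption, not stated above).
TOY_KG = [
    Triple("umbrella", "UsedFor", "staying dry"),
    Triple("umbrella", "AtLocation", "closet"),
    Triple("rain", "Causes", "wet ground"),
]


def retrieve_explicit_knowledge(entities, kg=TOY_KG):
    """Stage 1: pull every triple whose head or tail matches a detected entity."""
    return [t for t in kg if t.head in entities or t.tail in entities]


def post_process_by_type(triples, preferred_relations):
    """Stage 2: keep only triples whose relation type is relevant to the question."""
    return [t for t in triples if t.relation in preferred_relations]


def augment_with_graph(triples, entity_feats):
    """Stage 3 (toy): one round of mean message passing over a graph built
    from the retained triples; a real system would use a trained GNN."""
    neighbours = defaultdict(list)
    for t in triples:
        neighbours[t.head].append(t.tail)
        neighbours[t.tail].append(t.head)
    updated = {}
    for node, feat in entity_feats.items():
        msgs = [entity_feats.get(n, 0.0) for n in neighbours[node]]
        updated[node] = feat if not msgs else 0.5 * feat + 0.5 * sum(msgs) / len(msgs)
    return updated


if __name__ == "__main__":
    entities = {"umbrella", "rain"}         # e.g. objects detected in the image
    feats = {"umbrella": 1.0, "rain": 0.2}  # placeholder scalar node features
    explicit = retrieve_explicit_knowledge(entities)
    relevant = post_process_by_type(explicit, {"UsedFor", "Causes"})
    print(relevant)
    print(augment_with_graph(relevant, feats))
```

In practice, stage 1 would retrieve from a full-scale knowledge graph, stage 2 would rank by contextual relevance rather than a fixed relation whitelist, and stage 3 would run a heterogeneous-graph GNN over learned embeddings instead of the hand-rolled averaging shown here.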