@inproceedings{kil-etal-2024-ii,
title = "{II}-{MMR}: Identifying and Improving Multi-modal Multi-hop Reasoning in Visual Question Answering",
author = "Kil, Jihyung and
Tavazoee, Farideh and
Kang, Dongyeop and
Kim, Joo-Kyung",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.findings-acl.636/",
doi = "10.18653/v1/2024.findings-acl.636",
pages = "10698--10709",
abstract = "Visual Question Answering (VQA) often involves diverse reasoning scenarios across Vision and Language (V{\&}L). Most prior VQA studies, however, have merely focused on assessing the model`s overall accuracy without evaluating it on different reasoning cases. Furthermore, some recent works observe that conventional Chain-of-Thought (CoT) prompting fails to generate effective reasoning for VQA, especially for complex scenarios requiring multi-hop reasoning. In this paper, we propose II-MMR, a novel idea to identify and improve multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA question with an image and finds a reasoning path to reach its answer using two novel language promptings: (i) answer prediction-guided CoT prompt, or (ii) knowledge triplet-guided prompt. II-MMR then analyzes this path to identify different reasoning cases in current VQA benchmarks by estimating how many hops and what types (i.e., visual or beyond-visual) of reasoning are required to answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR observes that most of their VQA questions are easy to answer, simply demanding {\textquotedblleft}single-hop{\textquotedblright} reasoning, whereas only a few questions require {\textquotedblleft}multi-hop{\textquotedblright} reasoning. Moreover, while the recent V{\&}L model struggles with such complex multi-hop reasoning questions even using the traditional CoT method, II-MMR shows its effectiveness across all reasoning cases in both zero-shot and fine-tuning settings."
}