@inproceedings{wang-shu-2025-spatial,
title = "Spatial-Aware Visual Program Guided Reasoning for Answering Complex Visual Questions",
author = "Wang, Haoran and
Shu, Kai",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.121/",
pages = "1942--1953",
ISBN = "979-8-89176-303-6",
abstract = "Visual Question Answering (VQA) often requires complex multi-hop reasoning encompassing both vision and language. Despite the remarkable performance of Large Multimodal Models (LMMs) in vision-language tasks, they encounter difficulties when faced with challenging scenarios that require complex reasoning and may be susceptible to object hallucination. This paper introduces a novel framework named Spatial-aware Visual Program Reasoning (SVPR). The primary goal of SVPR is to enhance the alignment between vision and language within LMMs, fostering their multi-hop reasoning abilities and ultimately strengthening their capacity to address complex visual reasoning tasks. We first utilize the strong visual understanding abilities of LMMs to generate scene graphs, facilitating coordination between vision and language at semantic levels. Then, we leverage the in-context learning ability of LMMs to generate visual programs, which guide the question decomposition process. Finally, we employ a program solver to execute the programs and derive the final answer. This process makes our approach both explanatory and robust, providing clear explanations of its reasoning process while ensuring the faithfulness of the answer to the visual input. We evaluate our framework on two challenging multi-hop multimodal VQA datasets and show its effectiveness under zero-shot settings."
}
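
For orientation, here is a minimal Python sketch of the three-stage SVPR pipeline the abstract describes (scene-graph generation, in-context program generation, program execution by a solver). The `LMM` interface, the prompt wording, the triple format, and the `FIND`/`RELATE`/`ANSWER` program ops are illustrative assumptions, not the authors' actual implementation.

```python
# Hypothetical sketch of the SVPR pipeline from the abstract; names,
# prompts, and program ops are assumptions made for illustration.
from typing import Callable, List, Tuple

Triple = Tuple[str, str, str]        # (subject, relation, object)
LMM = Callable[[bytes, str], str]    # assumed (image, prompt) -> text


def svpr_answer(lmm: LMM, image: bytes, question: str, exemplars: str) -> str:
    # Stage 1: elicit a scene graph from the LMM, aligning vision and
    # language at the semantic level.
    graph_text = lmm(image, "List the objects and their spatial relations "
                            "as 'subject relation object' lines.")
    graph = parse_triples(graph_text)

    # Stage 2: use in-context exemplars so the LMM decomposes the question
    # into an executable visual program (one op per line).
    program_text = lmm(image, f"{exemplars}\nQuestion: {question}\nProgram:")

    # Stage 3: run the program solver over the scene graph.
    return solve(program_text.splitlines(), graph)


def parse_triples(text: str) -> List[Triple]:
    # Naive parser for "subject relation object" lines; placeholder only.
    triples = []
    for line in text.splitlines():
        parts = line.split()
        if len(parts) == 3:
            triples.append((parts[0], parts[1], parts[2]))
    return triples


def solve(program: List[str], graph: List[Triple]) -> str:
    # Toy solver over an assumed op vocabulary; the paper's actual
    # program language is not specified in the abstract.
    selection: List[str] = []
    for step in program:
        if not step.strip():
            continue
        op, *args = step.split()
        if op == "FIND":      # FIND <object>: select matching subjects
            selection = [s for (s, _, _) in graph if s == args[0]]
        elif op == "RELATE":  # RELATE <rel>: follow a relation edge
            selection = [o for (s, r, o) in graph
                         if s in selection and r == args[0]]
        elif op == "ANSWER":  # ANSWER: report the current selection
            return selection[0] if selection else "unknown"
    return "unknown"


if __name__ == "__main__":
    graph = [("cup", "left_of", "laptop"), ("laptop", "on", "desk")]
    # "What is the cup to the left of?" -> FIND cup; RELATE left_of; ANSWER
    print(solve(["FIND cup", "RELATE left_of", "ANSWER"], graph))  # laptop
```

The toy solver makes the claimed interpretability concrete: each executed step is inspectable, and the answer is constrained to entities present in the scene graph, which is how the abstract argues faithfulness to the visual input.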