@inproceedings{khandelwal-etal-2023-analyzing,
title = "Analyzing Modular Approaches for Visual Question Decomposition",
author = "Khandelwal, Apoorv and
Pavlick, Ellie and
Sun, Chen",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2023.emnlp-main.157/",
doi = "10.18653/v1/2023.emnlp-main.157",
pages = "2590--2603",
abstract = "Modular neural networks without additional training have recently been shown to surpass end-to-end neural networks on challenging vision{--}language tasks. The latest such methods simultaneously introduce LLM-based code generation to build programs and a number of skill-specific, task-oriented modules to execute them. In this paper, we focus on ViperGPT and ask where its additional performance comes from and how much is due to the (state-of-art, end-to-end) BLIP-2 model it subsumes vs. additional symbolic components. To do so, we conduct a controlled study (comparing end-to-end, modular, and prompting-based methods across several VQA benchmarks). We find that ViperGPT`s reported gains over BLIP-2 can be attributed to its selection of task-specific modules, and when we run ViperGPT using a more task-agnostic selection of modules, these gains go away. ViperGPT retains much of its performance if we make prominent alterations to its selection of modules: e.g. removing or retaining only BLIP-2. We also compare ViperGPT against a prompting-based decomposition strategy and find that, on some benchmarks, modular approaches significantly benefit by representing subtasks with natural language, instead of code. Our code is fully available at https://github.com/brown-palm/visual-question-decomposition."
}
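The abstract describes ViperGPT-style systems, in which an LLM writes a short program that calls vision modules (such as BLIP-2) to answer a visual question. As a rough illustration only (the class name, method names, and example question below are assumptions for this sketch, not taken from the paper; the authors' actual code is at the repository linked in the abstract), a generated program might look like this:

```python
# Illustrative sketch of a ViperGPT-style generated program for the question
# "What color is the cup to the left of the laptop?". The ImagePatch interface
# and its methods are stand-ins invented for this example, not the paper's API.

class ImagePatch:
    """Minimal stand-in for a vision-module wrapper (e.g. backed by BLIP-2)."""

    def __init__(self, image):
        self.image = image

    def find(self, object_name):
        # Would call an open-vocabulary detector; here it just returns the whole patch.
        return [self]

    def simple_query(self, question):
        # Would call an end-to-end VQA model such as BLIP-2; here it returns a dummy answer.
        return "blue"


def execute_query(image):
    patch = ImagePatch(image)
    cups = patch.find("cup")
    # A real generated program would select the cup left of the laptop using
    # patch coordinates; this sketch just takes the first detection.
    target = cups[0]
    return target.simple_query("What color is this cup?")


print(execute_query(image=None))
```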
Markdown (Informal)
[Analyzing Modular Approaches for Visual Question Decomposition](https://aclanthology.org/2023.emnlp-main.157/) (Khandelwal et al., EMNLP 2023)
ACL
Apoorv Khandelwal, Ellie Pavlick, and Chen Sun. 2023. Analyzing Modular Approaches for Visual Question Decomposition. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 2590–2603, Singapore. Association for Computational Linguistics.