@comment{
  ACL Anthology export for the ALVR 2026 workshop paper below.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  build; confirm the final published URL before relying on it.
  NOTE(review): the author is recorded as a single name token, as exported.
}
@inproceedings{prasanth-2026-efficient,
  title     = {Efficient Visual Grounding in {VQA} via Question-Guided Sparse Attention},
  author    = {Prasanth},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.24/},
  pages     = {260--271},
  isbn      = {979-8-89176-398-2},
  abstract  = {Visual Question Answering (VQA) models process all image patches uniformly despite questions typically requiring only a small subset of visual information. This inefficiency leads to unnecessary computation and can result in attention dilution across irrelevant image regions. We propose \textbf{Question-Guided Sparse Attention (QGSA)}, a plug-and-play mechanism that dynamically selects relevant image patches conditioned on question semantics. Our approach introduces three components: (1) a differentiable patch selector based on Gumbel-Softmax reparameterisation that enables end-to-end training with hard patch selection at inference; (2) a self-supervised grounding loss that encourages spatial selectivity without bounding-box annotations, combining contrastive patch selection with patch{--}word alignment via a frozen CLIP encoder; and (3) an adaptive sparsity mechanism that adjusts the number of selected patches according to estimated question complexity. Experiments on SmolVLM-256M-Instruct and SmolVLM-500M-Instruct across three VQA benchmarks (VQA-RAD, A-OKVQA, RefCOCO) demonstrate that QGSA reduces cross-attention FLOPs by 91{--}99{\%} across input resolutions, achieving up to $76\times$ theoretical speedup at 576px resolution, while maintaining \textit{exact} accuracy parity with the dense baseline ($\Delta=0.0$ pp on all datasets). Wall-clock parity with the dense baseline is reached at 336px; realised end-to-end speedup requires larger models where cross-attention dominates total compute. QGSA consistently selects an average of $k\approx17$ patches out of 576 (256M model), up to $k\approx18$ (500M model), yielding up to a $34\times$ reduction in the visual token sequence. These small-scale results validate the feasibility of question-conditioned sparse attention and provide a foundation for scaling to larger VLMs.},
}
Markdown (Informal)
[Efficient Visual Grounding in VQA via Question-Guided Sparse Attention](https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.24/) (Prasanth, ALVR 2026)
ACL