% Workshop paper record: Bukkapatnam, ALVR 2026 (pages 287--293).
% NOTE(review): the url field points at a preview.aclanthology.org ingest host;
% confirm the permanent address before relying on it in a published bibliography.
@inproceedings{bukkapatnam-2026-compositional,
  title     = {The Compositional Grounding Gap: Why Vision-Language Models Fail at Relational Reasoning and How to Fix It},
  author    = {Bukkapatnam, Kaustubh S.},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.28/},
  pages     = {287--293},
  isbn      = {979-8-89176-398-2},
  abstract  = {Large vision-language models (LVLMs) achieve strong performance on many multimodal tasks, yet consistently fail at compositional relational reasoning{---}distinguishing ``the cat on the mat'' from ``the mat on the cat.'' We provide a formal explanation for this failure. We prove that any vision-language alignment operating on pooled (order-invariant) visual features contains compositional blind spots: semantically distinct scenes that map to identical representations. We show that the number of blind spots grows factorially with scene complexity, establishing a fundamental limit on pooled-feature architectures. Motivated by this analysis, we propose REGROUND, a training-free, test-time method that re-introduces spatial structure into alignment by performing relation-guided cross-attention over spatial visual tokens, directed by a lightweight parse of the text query. Without any fine-tuning, REGROUND improves compositional accuracy by +8.6 points on Winoground, +8.4 on ARO-Relation, +6.4 on SugarCrepe, and +8.4 on VSR when applied to LLaVA-1.5, and provides consistent gains across other LVLMs. Ablation studies confirm that each component{---}parse guidance, token-level attention, and relation masking{---}contributes significantly.},
}
Markdown (Informal)
[The Compositional Grounding Gap: Why Vision-Language Models Fail at Relational Reasoning and How to Fix It](https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.28/) (Bukkapatnam, ALVR 2026)
ACL