% TrustNLP 2026 workshop paper, ACL Anthology ID 2026.trustnlp-main.40.
% The url field uses the canonical Anthology address (not a preview-branch build).
@inproceedings{zhao-li-2026-halo,
  title     = {The Halo Effect and Language Takeover: Spatiotemporal Attention Decay Explains Vision-Language Model Failures in Simple Visual Counting},
  author    = {Zhao, Haochen and
               Li, Sujian},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galstyan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.trustnlp-main.40/},
  pages     = {539--545},
  isbn      = {979-8-89176-418-7},
  abstract  = {Despite their remarkable capabilities in complex multimodal reasoning, Vision Language Models (VLMs) exhibit a perplexing inability to perform elementary visual counting tasks reliably. Existing hypotheses, often centering on input resolution or patch tokenization, fail to fully explain the stochastic nature of these errors, particularly in multi-digit generation. In this work, we investigate the internal decision-making dynamics of VLMs (e.g., Qwen3-VL, Gemma3) through the lens of attention mechanisms. By leveraging a controlled synthetic dataset and introducing novel metrics for Visual \textit{Sparsity} and \textit{Entropy}, we discover a novel phenomenon: \textbf{Spatiotemporal Attention Decay}. Our analysis reveals two distinct failure modes. Spatially, models exhibit a \textbf{Halo Effect}, where attention focuses on the peripheral convex hull of object clusters rather than penetrating the geometric centers of individual instances. Temporally, we observe a phenomenon of \textbf{Language Takeover}: during auto-regressive decoding, visual grounding decays rapidly after the initial token. Quantitative analysis confirms that as attention sparsity drops and entropy rises, the generation of subsequent digits degenerates from visual perception into hallucination driven by language priors. These findings suggest that counting failures stem from the model{'}s inability to maintain spatiotemporal focus, highlighting the need for mechanisms that enforce persistent visual grounding.},
}