% Findings of ACL 2026 paper by Zhu and Feng (pages 18586--18597).
% NOTE(review): `url` points at a preview.aclanthology.org ingest build, which
% looks like a staging address -- confirm the canonical URL before citing.
% NOTE(review): `address` may be the conference location rather than the
% publisher's city -- TODO confirm against the target bibliography style.
@inproceedings{zhu-feng-2026-attenuation,
  title     = {From Attenuation to Attention: Variational Information Flow Manipulation for Fine-Grained Visual Perception},
  author    = {Zhu, Jilong and
               Feng, Yang},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.927/},
  pages     = {18586--18597},
  isbn      = {979-8-89176-395-1},
  abstract  = {While Multimodal Large Language Models (MLLMs) have demonstrated impressive capabilities in general visual understanding, they frequently falter in fine-grained perception tasks that require identifying tiny objects or discerning subtle visual relationships. We attribute this limitation to Visual Attenuation: a phenomenon where sparse fine-grained visual signals are prematurely suppressed or diluted by dominant textual tokens during network propagation, resulting in a ``loss of focus'' during the deep-level decision-making process. Existing input-centric solutions fail to fundamentally reverse this intrinsic mechanism of information loss. To address this challenge, we propose the Variational Information Flow (VIF) framework. Adopting a probabilistic perspective, VIF leverages a Conditional Variational Autoencoder (CVAE) to model the visual saliency relevant to the question-answer pair as a latent distribution. As a plug-and-play module, VIF can be integrated into existing architectures. Extensive evaluations across diverse benchmarks{---}covering General VQA, fine-grained perception, and visual grounding{---}demonstrate that VIF yields competitive improvements over previous methods, validating its effectiveness in enhancing the fine-grained perception of MLLMs. Codes are available at https://github.com/ictnlp/VIF.},
}
Markdown (Informal)
[From Attenuation to Attention: Variational Information Flow Manipulation for Fine-Grained Visual Perception](https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.927/) (Zhu & Feng, Findings 2026)
ACL