@inproceedings{sharma-etal-2024-losing,
title = "Losing Visual Needles in Image Haystacks: Vision Language Models are Easily Distracted in Short and Long Contexts",
author = "Sharma, Aditya and
Saxon, Michael and
Wang, William Yang",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Add-Cong-Liu-Florida-Atlantic-University-author-id/2024.findings-emnlp.312/",
doi = "10.18653/v1/2024.findings-emnlp.312",
pages = "5429--5451",
abstract = "We present LoCoVQA, a dynamic benchmark generator for evaluating long-context reasoning in vision language models (VLMs). LoCoVQA augments test examples for mathematical reasoning, VQA, and character recognition tasks with increasingly long visual contexts composed of both in-distribution and out-of-distribution distractor images.Across these tasks, a diverse set of VLMs rapidly lose performance as the visual context length grows, often exhibiting a striking logarithmic decay trend. This test assesses how well VLMs can ignore irrelevant information when answering queries{---}a task that is quite easy for language models (LMs) in the text domain{---}demonstrating that current state-of-the-art VLMs lack this essential capability for many long-context applications."
}