@inproceedings{kanade-ganu-2026-see,
title = "Do You See Me : A Multidimensional Benchmark for Evaluating Visual Perception in Multimodal {LLM}s",
author = "Kanade, Aditya Sanjiv and
Ganu, Tanuja",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.343/",
pages = "7285--7326",
ISBN = "979-8-89176-380-7",
abstract = "Multimodal Large Language Models (MLLMs) show reasoning promise, yet their visual perception is a critical bottleneck. Paradoxically, MLLMs sometimes produce correct answers while misinterpreting crucial visual elements, masking these underlying perception failures. Our preliminary analysis on a joint perception-reasoning dataset revealed that 29{\%} of correct reasoning answers from a leading MLLM contained perception errors. To systematically study visual perception abilities of MLLMs, we introduce \textbf{Do You See Me}- a scalable, programmatically generated benchmark with 1758 images and 2612 questions across seven core subtasks spanning 2D and 3D variants (twelve total tasks) providing parametric control over difficulty levels. The benchmark tasks are inspired by human psychology. Our evaluation of eleven leading MLLMs reveals a stark deficit: humans achieve 95.83{\%} accuracy, while top MLLMs average below 50{\%}. This performance gap widens drastically as task complexity increases. Further diagnostics show: (1) supervised finetuning offers only modest gains (11{\%}), (2) models tend to exploit task ``shortcuts'' like MCQ formats over detailed visual analysis, and (3) Chain-of-Thought prompting can degrade complex visual tasks by verbalizing images into lossy text. These findings expose the foundational perception limits in current MLLMs and highlight the need for robust visual perception improvements in MLLMs. The benchmark dataset, source code and evaluation scripts are available at[{\ensuremath{<}}https://github.com/microsoft/Do-You-See-Me{\ensuremath{>}}]."
}