% Workshop paper record: "FADE: Probing the Limits of VLMs on fine-grained OCR"
% (Shah, Kathrotia, Badhe), in the ALVR 2026 workshop proceedings.
% Acronyms in title/booktitle are brace-protected so sentence-casing styles keep them.
% NOTE(review): the url field points at the preview.aclanthology.org ingest host;
% confirm whether the canonical aclanthology.org address should replace it.
@inproceedings{shah-etal-2026-fade,
  title     = {{FADE}: Probing the Limits of {VLMs} on fine-grained {OCR}},
  author    = {Shah, Deep and
               Kathrotia, Nehal and
               Badhe, Sanket},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.23/},
  pages     = {249--259},
  isbn      = {979-8-89176-398-2},
  abstract  = {Multimodal Large Language Models (MLLMs) have achieved remarkable success in semantic visual reasoning, yet their capacity for fine-grained, low-level perception remains critically under-evaluated. This perceptual fragility limits their reliability in noisy, real-world environments where visual signals are degraded. Furthermore, existing benchmarks often entangle visual perception with language priors, masking these underlying deficits. To address this, we introduce the FAint numeric Detection Evaluation (FADE) dataset, a novel evaluation suite designed to probe the limits of zero-shot Optical Character Recognition (OCR) in frontier MLLMs. By embedding synthetic, strictly numerical sequences over cluttered natural backgrounds at varying levels of transparency ($\alpha$), FADE explicitly disentangles pure visual perception from semantic predictability. We evaluate state-of-the-art models including Gemini 3.0, Claude 4.5 Sonnet, and Gemma 3 against a specialized UNet segmentation baseline. Our results reveal a striking limitation in frontier architectures: while they achieve near-perfect transcription at high visibility, their performance collapses under high transparency. Conversely, the UNet pipeline maintains robust spatial grounding, significantly outperforming generalist models at the lowest visibility thresholds. FADE provides a reproducible dataset to expose and diagnose the perceptual breakage points of modern multimodal systems.},
}
Markdown (Informal)
[FADE: Probing the Limits of VLMs on fine-grained OCR](https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.23/) (Shah et al., ALVR 2026)
ACL
- Deep Shah, Nehal Kathrotia, and Sanket Badhe. 2026. FADE: Probing the Limits of VLMs on fine-grained OCR. In Proceedings of the 4th Workshop on Advances in Language and Vision Research (ALVR), pages 249–259, San Diego, California, USA. Association for Computational Linguistics.