% TIU-Bench paper, Findings of the ACL: EMNLP 2025 (ACL Anthology record).
% The url field uses the permanent aclanthology.org address (same paper ID as the DOI)
% rather than a preview-branch build, which is not a stable link.
@inproceedings{zhang-etal-2025-tiu,
  title     = {{TIU}-Bench: A Benchmark for Evaluating Large Multimodal Models on Text-rich Image Understanding},
  author    = {Zhang, Kun and
               Niu, Liqiang and
               Cao, Zhen and
               Meng, Fandong and
               Zhou, Jie},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1318/},
  doi       = {10.18653/v1/2025.findings-emnlp.1318},
  pages     = {24286--24295},
  isbn      = {979-8-89176-335-7},
  abstract  = {Text-rich images are ubiquitous in real-world applications, serving as a critical medium for conveying complex information and facilitating accessibility. Despite recent advances driven by Multimodal Large Language Models (MLLMs), existing benchmarks suffer from limited scale, fragmented scenarios, and evaluation protocols that fail to fully capture holistic image understanding. To address these gaps, we present TIU-Bench, a large-scale, multilingual benchmark comprising over 100,000 full-image annotations and 22,000 rigorously validated question-answer (QA) pairs that span 18 subtasks across diverse real-world scenarios. TIU-Bench introduces a novel full-image structured output format that jointly models geometric, textual, and relational information, enabling fine-grained evaluation of perception and reasoning capabilities. Furthermore, we propose a two-stage understanding framework named T2TIU, which first generates a structured representation of the entire image and subsequently conducts reasoning on this representation to address complex visual-textual queries. Extensive experiments on 10 state-of-the-art generative models highlight the challenges and opportunities in advancing text-rich image understanding. Our benchmark and framework provide a comprehensive platform for developing and evaluating next-generation multimodal AI systems.},
}
Markdown (Informal)
[TIU-Bench: A Benchmark for Evaluating Large Multimodal Models on Text-rich Image Understanding](https://aclanthology.org/2025.findings-emnlp.1318/) (Zhang et al., Findings 2025)
ACL