@inproceedings{tang-sun-2025-big,
title = "Big Escape Benchmark: Evaluating Human-Like Reasoning in Language Models via Real-World Escape Room Challenges",
author = "Tang, Zinan and
Sun, QiYao",
editor = "Dhole, Kaustubh and
Clinciu, Miruna",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/corrections-2025-08/2025.gem-1.42/",
pages = "488--503",
ISBN = "979-8-89176-261-9",
    abstract = "Large Language Models (LLMs) have recently demonstrated remarkable reasoning capabilities across a wide range of tasks. While many benchmarks have been developed for specific academic subjects, coding, or constrained visual tasks, they often fail to fully capture the breadth, diversity, and dynamic nature of real-world human reasoning. Further, the creation of high-quality, complex multimodal reasoning benchmarks typically requires significant manual effort and expert annotation, which is costly and time-consuming. To address these limitations, we introduce Big Escape Bench, a novel multimodal reasoning benchmark derived from popular reality shows and television programs. Big Escape Bench leverages unique characteristics of TV content, providing a rich source of challenging and realistic multimodal reasoning problems. Key advantages include: questions guaranteed to be human-solvable and of moderate difficulty; problems reflecting diverse, real-world scenarios and knowledge domains; and high inherent quality due to content generated by professional program teams. Notably, we develop an automated pipeline to construct the data from these programs into a standardized benchmark format, significantly reducing the manual effort compared to traditional dataset construction. We have conducted extensive experiments to evaluate state-of-the-art (SOTA) LLMs and Multimodal Large Language Models (MLLMs) on Big Escape Bench. Our results reveal a surprising performance gap: while the questions are readily solved by human viewers (about 60{\%} accuracy), the performance of even the most advanced models (best 40.50{\%} accuracy) is significantly lower than human-level accuracy. This highlights that despite recent progress, MLLMs still face substantial challenges in robustly performing the kind of diverse, dynamic, and context-dependent reasoning that is trivial for humans in routine situations. Big Escape Bench serves as a valuable tool for identifying current limitations of MLLMs and fostering future research towards more human-like multimodal reasoning."
}