@inproceedings{ye-etal-2025-mmscibench,
title = "{MMS}ci{B}ench: Benchmarking Language Models on {C}hinese Multimodal Scientific Problems",
author = "Ye, Xinwu and
Li, Chengfan and
Chen, Siming and
Wei, Wei and
Tang, Robert",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-acl.755/",
pages = "14621--14663",
ISBN = "979-8-89176-256-5",
abstract = "Recent advances in large language models (LLMs) and vision-language models (LVLMs) have shown promise across many tasks, yet their scientific reasoning capabilities remain untested, particularly in multimodal settings. We present MMSciBench, a benchmark for evaluating mathematical and physical reasoning through text-only and text-image formats, with human-annotated difficulty levels, solutions with detailed explanations, and taxonomic mappings. Evaluation of state-of-the-art models reveals significant limitations, with even the best model achieving only 63.77{\%} accuracy and particularly struggling with visual reasoning tasks. Our analysis exposes critical gaps in complex reasoning and visual-textual integration, establishing MMSciBench as a rigorous standard for measuring progress in multimodal scientific understanding. The code for MMSciBench is open-sourced at GitHub, and the dataset is available at Hugging Face."
}
Markdown (Informal)
[MMSciBench: Benchmarking Language Models on Chinese Multimodal Scientific Problems](https://aclanthology.org/2025.findings-acl.755/) (Ye et al., Findings 2025)