@inproceedings{wu-etal-2025-c2rbench,
title = "{C}{\texttwosuperior}{RB}ench: A {C}hinese Complex Reasoning Benchmark for Large Language Models",
author = "Wu, Junru and
Shen, Tianhao and
Su, Linxi and
Xiong, Deyi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.1083/",
pages = "21031--21050",
ISBN = "979-8-89176-256-5",
abstract = "Large language models (LLMs) have achieved remarkable progress in autonomous reasoning, evolving from basic text processing to sophisticated multimodal reasoning, a critical capability for general-purpose AI assistants. However, existing benchmarks usually fail to adequately capture the intricate multi-step reasoning demands inherent in real-world scenarios. To bridge this gap, we propose **C{\texttwosuperior}RBench**: a **C**hinese **C**omplex **R**easoning **Bench**mark for evaluating multi-step, multimodal advanced reasoning capability of LLMs. C{\texttwosuperior}RBench comprises 1,115 carefully curated Chinese tasks, which are organized into eight domain-specific subsets, each meticulously designed to mirror real-world challenges. This hierarchical benchmark features three difficulty tiers based on the number of reasoning steps required (average 8.44 steps per task), significantly exceeding existing benchmarks in cognitive complexity. Extensive evaluations of 20 LLMs (including DeepSeek-R1) and 24 multimodal large language models (MLLMs) on C{\texttwosuperior}RBench reveal critical performance gaps: GPT-4.1 achieves only 52.11{\%} accuracy, indicating substantial room for improvement. The dataset and evaluation code are publicly available."
}
Markdown (Informal)
[C²RBench: A Chinese Complex Reasoning Benchmark for Large Language Models](https://preview.aclanthology.org/display_plenaries/2025.findings-acl.1083/) (Wu et al., Findings 2025)
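For orientation, the abstract implies a simple evaluation shape: tasks are bucketed into three difficulty tiers by annotated reasoning-step count and scored by accuracy. The sketch below is a minimal, hypothetical rendering of that shape; the `Task` schema, the tier cut-offs, and the exact-match scoring are all assumptions for illustration, not the paper's released data format or evaluation code.

```python
from dataclasses import dataclass
from collections import defaultdict

@dataclass
class Task:
    """One benchmark item (hypothetical schema; the released format may differ)."""
    domain: str           # one of the eight domain-specific subsets
    reasoning_steps: int  # annotated step count (the abstract reports an average of 8.44)
    answer: str           # gold answer string

def tier(task: Task) -> str:
    """Assign a task to one of three difficulty tiers by step count.
    The cut-offs below are illustrative, not the paper's actual thresholds."""
    if task.reasoning_steps <= 5:
        return "easy"
    if task.reasoning_steps <= 10:
        return "medium"
    return "hard"

def per_tier_accuracy(tasks: list[Task], predictions: list[str]) -> dict[str, float]:
    """Exact-match accuracy per tier (a stand-in for the paper's scoring)."""
    correct: dict[str, int] = defaultdict(int)
    total: dict[str, int] = defaultdict(int)
    for task, pred in zip(tasks, predictions):
        t = tier(task)
        total[t] += 1
        correct[t] += int(pred.strip() == task.answer.strip())
    return {t: correct[t] / total[t] for t in total}
```

Reporting accuracy per tier rather than a single aggregate is what makes the headline result (e.g., GPT-4.1 at 52.11% overall) interpretable against the benchmark's difficulty structure.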