@comment{Findings of ACL 2026 paper (Anthology ID 2026.findings-acl.1262). Title words that must keep their case are protected as whole words. NOTE(review): url uses the canonical aclanthology.org permalink rather than the temporary preview/ingest host; confirm it resolves once the volume is live.}
@inproceedings{duan-etal-2026-bloomeval,
  title     = {{BloomEval}: A {Bloom's} Cognitive Taxonomy-Based Benchmark for Evaluating {LRMs} via Cognitive Hierarchy Trace},
  author    = {Duan, Zhiyi and
               Gao, Lei and
               Guan, Jiangshan and
               Wang, Qi and
               Liu, Rui},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1262/},
  pages     = {25228--25248},
  isbn      = {979-8-89176-395-1},
  abstract  = {Current benchmarks for Large Reasoning Models (LRMs) primarily rely on answer correctness, failing to assess the structural coherence and cognitive soundness of the reasoning process itself. To address this gap, we introduce Cognitive Hierarchy Trace (CHT), a novel evaluation framework grounded in Bloom's Cognitive Taxonomy (BCT). CHT provides a structured, step-wise mapping of a model's reasoning trajectory onto hierarchical cognitive levels, enabling the detection of structural anomalies such as hierarchy jumps, breaks, and overthinking. Based on CHT, we present BloomEval, the first large-scale benchmark designed for fine-grained cognitive capability assessment. It comprises 94,602 math problems, each annotated with Bloom's cognitive levels, CHT trajectories, a three-tier knowledge hierarchy, and problem difficulty. To ensure scalable yet reliable annotation, we develop an Expert-LLM collaborative pipeline with a three-stage reconciliation mechanism. Our comprehensive evaluation reveals a critical finding: models often arrive at correct answers through cognitively flawed or opaque reasoning paths. The CHT-based analysis uncovers prevalent structural inconsistencies that are invisible to outcome-only metrics, demonstrating that answer accuracy is an insufficient proxy for reasoning quality.},
}
Markdown (Informal)
[BloomEval: A Bloom’s Cognitive Taxonomy-Based Benchmark for Evaluating LRMs via Cognitive Hierarchy Trace](https://aclanthology.org/2026.findings-acl.1262/) (Duan et al., Findings 2026)
ACL