@inproceedings{lian-etal-2025-beyond,
title = "Beyond Surface Simplicity: Revealing Hidden Reasoning Attributes for Precise Commonsense Diagnosis",
author = "Lian, Huijun and
Sun, Zekai and
Chen, Keqi and
Gao, Yingming and
Li, Ya",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.627/",
pages = "12820--12835",
ISBN = "979-8-89176-251-0",
abstract = "Commonsense question answering (QA) are widely used to evaluate the commonsense abilities of large language models. However, answering commonsense questions correctly requires not only knowledge but also reasoning{---}even for seemingly simple questions. We demonstrate that such hidden reasoning attributes in commonsense questions can lead evaluation accuracy differences of up to 24.8{\%} across different difficulty levels in the same benchmark. Current benchmarks overlook these hidden reasoning attributes, making it difficult to assess a model{'}s specific levels of commonsense knowledge and reasoning ability. To address this issue, we introduce ReComSBench, a novel framework that reveals hidden reasoning attributes behind commonsense questions by leveraging the knowledge generated during the reasoning process. Additionally, ReComSBench proposes three new metrics for decoupled evaluation: Knowledge Balanced Accuracy, Marginal Sampling Gain, and Knowledge Coverage Ratio. Experiments show that ReComSBench provides insights into model performance that traditional benchmarks cannot offer. The difficulty stratification based on revealed hidden reasoning attributes performs as effectively as the model-probability-based approach but is more generalizable and better suited for improving a model{'}s commonsense reasoning abilities. By uncovering and analyzing the hidden reasoning attributes in commonsense data, ReComSBench offers a new approach to enhancing existing commonsense benchmarks."
}
Markdown (Informal)
[Beyond Surface Simplicity: Revealing Hidden Reasoning Attributes for Precise Commonsense Diagnosis](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.627/) (Lian et al., ACL 2025)
ACL