@comment{
  Conference paper record for helzerman-etal-2026-hard, in ACL Anthology export layout.
  NOTE(review): the url field points at a preview.aclanthology.org ingest branch;
    presumably this should become the canonical aclanthology.org address once the
    volume is published -- TODO confirm before citing.
  NOTE(review): the identifier in the url contains acl-srw, while booktitle names the
    main conference proceedings -- verify which volume title is correct.
  NOTE(review): address holds the conference venue, as exported; it is not
    necessarily the publisher city.
}
@inproceedings{helzerman-etal-2026-hard,
  title     = {How Hard is Math? Using Quantitative Metrics to Measure {LLM} Alignment to Human Intuitions of Difficulty},
  author    = {Helzerman, Micah and
               Wilson, Steven R and
               McLeman, Cam},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.85/},
  pages     = {968--981},
  isbn      = {979-8-89176-393-7},
  abstract  = {Modern LLMs have demonstrated advanced reasoning skills, including the ability to solve Olympiad-level mathematics problems. While solving more and more difficult problems is a hallmark of LLM progress, less attention has been placed on how ``difficulty'' is operationalized in the context of LLM problem solving tasks. This is particularly relevant in educational contexts where teachers or students may ask LLMs for ``easy'' or ``hard'' questions. In this paper, we explore various quantitative measurements from LLM-generated solutions and evaluate their inter-correlations, as well as their correlation to human-annotated difficulty scores. We find moderate correlations between metrics using log probabilities and output lengths, including some that are more strongly correlated to difficulty than LLM accuracy. We also train ModernBERT to predict difficulty scores, leading to reasonable accuracy within a given benchmark, but decreased performance when generalizing to other math benchmarks. Finally, to explore connections between difficulty scores and human performance, we collect problems, human solutions, and human performance data from the Putnam competition. We find poor alignment between LLM metrics and human-assigned difficulty scores, despite strong correlations between those scores and human performance on the problems.},
}
Markdown (Informal)
[How Hard is Math? Using Quantitative Metrics to Measure LLM Alignment to Human Intuitions of Difficulty](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.85/) (Helzerman et al., ACL 2026)
ACL