@comment{ACL Anthology record 2026.evaleval-1.20, a paper in the EvalEval 2026 workshop proceedings.
The url field holds the canonical Anthology address; the exported record pointed at an ingest preview branch.}
@inproceedings{cabrera-etal-2026-long,
  title     = {Too long; didn{'}t solve},
  author    = {Cabrera, Luc{\'i}a and
               D{'}Arcy, Jocelyn and
               Saxton-Knight, Isaac},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({EvalEval})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.evaleval-1.20/},
  pages     = {100--110},
  isbn      = {979-8-89176-429-3},
  abstract  = {Mathematical benchmarks consisting of a range of mathematics problems are widely used to evaluate the reasoning abilities of large language models, yet little is known about how their structural properties influence model behaviour. In this work, we investigate two structural length variables, prompt length and solution length, and analyse how they relate to model performance on a newly constructed adversarial dataset of expert-authored mathematics problems. Across five evaluated models, we find that both prompt length and solution length are positively associated with model failure. These associations are statistically significant but modest, and we interpret them as descriptive rather than causal. We also include a secondary, exploratory analysis of cross-model disagreement. Because disagreement measures based on variance are mechanically constrained by mean failure, we treat this part of the analysis cautiously. Overall, our main finding is that structural length is linked to empirical difficulty in this benchmark, suggesting that length should be considered as a potential confounder when interpreting mathematical model evaluations.},
}
Markdown (Informal)
[Too long; didn’t solve](https://aclanthology.org/2026.evaleval-1.20/) (Cabrera et al., EvalEval 2026)
ACL
- Lucía Cabrera, Jocelyn D’Arcy, and Isaac Saxton-Knight. 2026. Too long; didn’t solve. In Proceedings of the Workshop on Evaluating Evaluations (EvalEval), pages 100–110, San Diego, CA. Association for Computational Linguistics.