@inproceedings{kurtic-etal-2024-mathador,
title = "Mathador-{LM}: A Dynamic Benchmark for Mathematical Reasoning on Large Language Models",
author = "Kurtic, Eldar and
Moeini, Amir and
Alistarh, Dan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.emnlp-main.946/",
doi = "10.18653/v1/2024.emnlp-main.946",
pages = "17020--17027",
abstract = "We introduce Mathador-LM, a new benchmark for evaluating the mathematical reasoning on large language models (LLMs), combining ruleset interpretation, planning, and problem-solving. This benchmark is inspired by the Mathador game, where the objective is to reach a target number using basic arithmetic operations on a given set of base numbers, following a simple set of rules. We show that, across leading LLMs, we obtain stable average performance while generating benchmark instances dynamically, following a target difficulty level. Thus, our benchmark alleviates concerns about test-set leakage into training data, an issue that often undermines popular benchmarks. Additionally, we conduct a comprehensive evaluation of both open and closed-source state-of-the-art LLMs on Mathador-LM. Our findings reveal that contemporary models struggle with Mathador-LM, scoring significantly lower than average 3rd graders. This stands in stark contrast to their strong performance on popular mathematical reasoning benchmarks. The implementation of Mathador-LM benchmark is available at https://github.com/IST-DASLab/Mathador-LM."
}
Markdown (Informal)
[Mathador-LM: A Dynamic Benchmark for Mathematical Reasoning on Large Language Models](https://aclanthology.org/2024.emnlp-main.946/) (Kurtic et al., EMNLP 2024)
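As a concrete illustration of the game the abstract describes, here is a minimal Python sketch of a brute-force solver for a Mathador-style instance. The specific ruleset (each base number used at most once, only exact integer division, non-negative integer intermediates) and the restriction to left-to-right operation chains are assumptions made for illustration, not the paper's generation procedure; the authors' actual implementation is in the linked GitHub repository.

```python
from itertools import permutations, product

# Arithmetic operations; division is allowed only when exact, reflecting the
# assumed "integer results only" rule.
OPS = {
    "+": lambda a, b: a + b,
    "-": lambda a, b: a - b,
    "*": lambda a, b: a * b,
    "/": lambda a, b: a // b if b and a % b == 0 else None,
}

def solve(base, target):
    """Search left-to-right operation chains over subsets of `base`.

    Returns one expression reaching `target`, or None. Searching chains
    rather than full expression trees is a simplification.
    """
    for k in range(2, len(base) + 1):
        for nums in permutations(base, k):          # each base number used at most once
            for ops in product(OPS, repeat=k - 1):
                acc, expr = nums[0], str(nums[0])
                for op, n in zip(ops, nums[1:]):
                    acc = OPS[op](acc, n)
                    if acc is None or acc < 0:      # assumed rule: stay in non-negative integers
                        break
                    expr = f"({expr} {op} {n})"
                else:
                    if acc == target:
                        return expr
    return None

if __name__ == "__main__":
    # Hypothetical instance: five base numbers and a target.
    print(solve([3, 5, 7, 11, 2], 28))  # e.g. ((3 + 11) * 2)
```

With five base numbers this exhaustive search is small (a few tens of thousands of chains), which is why a simple enumeration suffices for checking whether a generated instance is solvable at all.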