@inproceedings{khalili-etal-2025-evaluating,
title = "Evaluating Intermediate Reasoning of Code-Assisted Large Language Models for Mathematics",
author = "Khalili, Zena Al and
Howell, Nick and
Klakow, Dietrich",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/nschneid-patch-1/2025.gem-1.64/",
pages = "741--758",
ISBN = "979-8-89176-261-9",
abstract = "Assisting LLMs with code generation improved their performanceon mathematical reasoning tasks.However, the evaluation of code-assisted LLMs is generally restricted to execution correctness, lacking a rigorous evaluation of their generated programs.In this work, we bridge this gap by conducting an in-depth analysis of code-assisted LLMs generated programs in response to math reasoning tasks, with a focus on evaluating the soundness of the underlying reasoning processes. For this purpose, we assess the generations of five LLMs, on several math datasets, both manually and automatically, and propose a taxonomy of generated programs based on their logical soundness.Our findings show that the capabilities of models significantly impact the logic implemented to solve the problem. Closed-source LLMs ground their programs in mathematical concepts, whereas open-source models often resort to unsound reasoning, relying on memorized information and exhaustive searches. Furthermore, increasing the difficulty of problems decreases sound generations for all models, revealing a critical shortcoming of LLMs on complex mathematics, contrary to what accuracy metrics suggest.Our work highlights the need for more holistic evaluations of code-assisted LLMs beyond execution accuracy metrics, toward a better understanding of LLMs' limits in the math domain."
}