@inproceedings{khalili-etal-2025-evaluating,
title = "Evaluating Intermediate Reasoning of Code-Assisted Large Language Models for Mathematics",
author = "Khalili, Zena Al and
Howell, Nick and
Klakow, Dietrich",
editor = "Dhole, Kaustubh and
Clinciu, Miruna",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/transition-to-people-yaml/2025.gem-1.64/",
pages = "741--758",
ISBN = "979-8-89176-261-9",
abstract = "Assisting LLMs with code generation improved their performanceon mathematical reasoning tasks.However, the evaluation of code-assisted LLMs is generally restricted to execution correctness, lacking a rigorous evaluation of their generated programs.In this work, we bridge this gap by conducting an in-depth analysis of code-assisted LLMs generated programs in response to math reasoning tasks, with a focus on evaluating the soundness of the underlying reasoning processes. For this purpose, we assess the generations of five LLMs, on several math datasets, both manually and automatically, and propose a taxonomy of generated programs based on their logical soundness.Our findings show that the capabilities of models significantly impact the logic implemented to solve the problem. Closed-source LLMs ground their programs in mathematical concepts, whereas open-source models often resort to unsound reasoning, relying on memorized information and exhaustive searches. Furthermore, increasing the difficulty of problems decreases sound generations for all models, revealing a critical shortcoming of LLMs on complex mathematics, contrary to what accuracy metrics suggest.Our work highlights the need for more holistic evaluations of code-assisted LLMs beyond execution accuracy metrics, toward a better understanding of LLMs' limits in the math domain."
}
Markdown (Informal)
[Evaluating Intermediate Reasoning of Code-Assisted Large Language Models for Mathematics](https://aclanthology.org/2025.gem-1.64/) (Khalili et al., GEM 2025)