% Conference paper from the AIME-Con 2025 "Full Papers" proceedings, pages 417--424.
% The url field uses the permanent aclanthology.org address instead of the
% temporary preview-branch host this entry was exported from
% (assumed canonical form of the same Anthology ID -- verify the link resolves).
% NOTE(review): address holds the conference venue as exported; some styles
% expect the publisher's city there -- confirm against the target style.
@inproceedings{zhang-graf-2025-mathematical,
  title     = {Mathematical Computation and Reasoning Errors by Large Language Models},
  author    = {Zhang, Liang and
               Graf, Edith},
  editor    = {Wilson, Joshua and
               Ormerod, Christopher and
               Beiting Parrish, Magdalen},
  booktitle = {Proceedings of the Artificial Intelligence in Measurement and Education Conference ({AIME-Con}): Full Papers},
  month     = oct,
  year      = {2025},
  address   = {Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States},
  publisher = {National Council on Measurement in Education (NCME)},
  url       = {https://aclanthology.org/2025.aimecon-main.45/},
  pages     = {417--424},
  isbn      = {979-8-218-84228-4},
  abstract  = {We evaluate four LLMs (GPT-4o, o1, DeepSeek-V3, DeepSeek-R1) on purposely challenging arithmetic, algebra, and number-theory items. Coding final answers and step-level solutions correctness reveals performance gaps, improvement paths, and how accurate LLMs can strengthen mathematics assessment and instruction.},
}
Markdown (Informal)
[Mathematical Computation and Reasoning Errors by Large Language Models](https://aclanthology.org/2025.aimecon-main.45/) (Zhang & Graf, AIME-Con 2025)
ACL
- Liang Zhang and Edith Graf. 2025. Mathematical Computation and Reasoning Errors by Large Language Models. In Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Full Papers, pages 417–424, Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States. National Council on Measurement in Education (NCME).