@inproceedings{khrulev-2025-check,
    title     = {{CHECK}-{MAT}: Probing the Mathematical Reasoning and Rubric-Alignment of Vision-Language Models on Handwritten Solutions},
    author    = {Khrulev, Ruslan},
    editor    = {Valentino, Marco and
      Ferreira, Deborah and
      Thayaparan, Mokanarangan and
      Ranaldi, Leonardo and
      Freitas, Andre},
    booktitle = {Proceedings of the 3rd Workshop on Mathematical Natural Language Processing (MathNLP 2025)},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.mathnlp-main.6/},
    pages     = {77--94},
    isbn      = {979-8-89176-348-7},
    abstract  = {The application of contemporary NLP models for inference over mathematical text remains a critical and under-explored area. While Vision-Language Models (VLMs) have shown promise, a significant gap exists in their ability to perform nuanced, rubric-based assessment of handwritten mathematical arguments, a task requiring the joint interpretation of visual, textual, and symbolic modalities. This paper directly addresses the need for robust evaluation tasks in this domain. This paper introduces CHECK-MAT, a new benchmark and methodology for the automated, rubric-based assessment of handwritten mathematical solutions using Vision-Language Models (VLMs). Composed of 122 real-world solutions from a high-stakes national exam, CHECK-MAT evaluates the capacity of VLMs to emulate expert graders by identifying logical flaws and applying detailed grading rubrics. Our systematic evaluation of seven state-of-the-art VLMs serves as a direct instance of probing the mathematical understanding of state-of-the-art models. We reveal key limitations in their ability to parse complex notation and align with human grading rubrics, which we frame as a challenge in understanding the linguistic analysis of mathematical discourse. Our work contributes a robust benchmark to the NLP community and offers critical insights for developing models with more sophisticated mathematical reasoning capabilities. You can find code in https://github.com/Karifannaa/Auto-check-EGE-math.},
}
Markdown (Informal)
[CHECK-MAT: Probing the Mathematical Reasoning and Rubric-Alignment of Vision-Language Models on Handwritten Solutions](https://aclanthology.org/2025.mathnlp-main.6/) (Khrulev, MathNLP 2025)
ACL