% Workshop paper entry (BEA 2026 proceedings, pages 208--220).
% NOTE(review): the url field points at an ACL Anthology preview/ingest build
% ("preview.aclanthology.org/ingest-acl-workshops"); swap in the canonical
% Anthology URL once the volume is published -- confirm the final address.
% NOTE(review): editor "Tack, Anais" may be missing a diaeresis on the "i" --
% confirm against the proceedings front matter before changing.
@inproceedings{liu-etal-2026-bigger,
  title     = {A Bigger Catch: Fine-Grained Curriculum Standards Alignment on the {MathFish} Benchmark},
  author    = {Liu, Xinman and
               Sharma, Mayank and
               Shi, Xinyu},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bann{\`o}, Stefano and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Anais and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 21st Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.15/},
  pages     = {208--220},
  isbn      = {979-8-89176-409-5},
  abstract  = {Most existing math benchmarks for LLMs focus on evaluating whether models produce correct solutions. In educational settings, however, it is equally important to understand whether LLMs grasp the pedagogical intent behind math problems, beyond simply arriving at the right answer. Tagging curriculum standards is challenging for the same reason: distinguishing fine-grained standards requires understanding subtle pedagogical distinctions. In this paper, we use the MathFish benchmark, which frames curriculum alignment as a multi-label prediction task over 385 Common Core State Standards, to evaluate a three-stage pipeline inspired by observed failure modes in retrieval and structural reasoning: curriculum-informed hard negatives (M1), a cross-encoder reranker (M2), and a ReAct agent paired with an LLM-as-a-judge critic (M3). We additionally evaluate a training-free alternative (A1) that combines hybrid sparse-dense retrieval with curriculum-graph reranking. M3 achieves 31.3{\%} exact-match accuracy, approximately 6.5{\texttimes} higher than the three-shot GPT-4-Turbo baseline. Error analysis shows that, despite these improvements, the pipeline still struggles with missing predictions, grade-level misalignment, and sibling-standard confusion, reinforcing that precise curriculum alignment remains a fundamentally difficult problem in educational NLP.},
}
Markdown (Informal)
[A Bigger Catch: Fine-Grained Curriculum Standards Alignment on the MathFish Benchmark](https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.15/) (Liu et al., BEA 2026)
ACL