% Lee and Hockenmaier (2025): survey paper in Findings of EMNLP 2025.
% The url field uses the canonical ACL Anthology address for this paper ID
% (the same ID that appears in the doi) rather than a temporary
% preview-branch link, which is not a stable citation target.
% NOTE(review): address holds the conference venue, as in ACL Anthology
% exports; confirm whether the target style expects the publisher city.
@inproceedings{lee-hockenmaier-2025-evaluating,
  title     = {Evaluating Step-by-step Reasoning Traces: A Survey},
  author    = {Lee, Jinu and
               Hockenmaier, Julia},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.94/},
  doi       = {10.18653/v1/2025.findings-emnlp.94},
  pages     = {1789--1814},
  isbn      = {979-8-89176-335-7},
  abstract  = {Step-by-step reasoning is widely used to enhance the reasoning ability of large language models (LLMs) in complex problems. Evaluating the quality of reasoning traces is crucial for understanding and improving LLM reasoning. However, existing evaluation practices are highly inconsistent, resulting in fragmented progress across evaluator design and benchmark development. To address this gap, this survey provides a comprehensive overview of step-by-step reasoning evaluation, proposing a taxonomy of evaluation criteria with four top-level categories (factuality, validity, coherence, and utility). Based on the taxonomy, we review different datasets, evaluator implementations, and recent findings, leading to promising directions for future research.},
}
Markdown (Informal)
[Evaluating Step-by-step Reasoning Traces: A Survey](https://aclanthology.org/2025.findings-emnlp.94/) (Lee & Hockenmaier, Findings 2025)
ACL