@inproceedings{zhang-etal-2025-cmedcalc,
title = "{CM}ed{C}alc-Bench: A Fine-Grained Benchmark for {C}hinese Medical Calculations in {LLM}",
author = "Zhang, Yunyan and
Zhu, Zhihong and
Wu, Xian",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1302/",
pages = "25661--25670",
ISBN = "979-8-89176-332-6",
abstract = "Large Language Models (LLMs) have demonstrated significant potential in medical diagnostics and clinical decision-making. While benchmarks such as MedQA and PubMedQA have advanced the evaluation of qualitative reasoning, existing medical NLP benchmarks still face two limitations: the absence of a Chinese benchmark for medical calculation tasks, and the lack of fine-grained evaluation of intermediate reasoning. In this paper, we introduce CMedCalc-Bench, a new benchmark designed for Chinese medical calculation. CMedCalc-Bench covers 69 calculators across 12 clinical departments, featuring over 1,000 real-world patient cases. Building on this, we design a fine-grained evaluation framework that disentangles clinical entity extraction from numerical computation, enabling systematic diagnosis of model deficiencies. Experiments across four model families, including medical-specialized and reasoning-focused, provide an assessment of their strengths and limitations on Chinese medical calculation. Furthermore, explorations on faithful reasoning and the demonstration effect offer early insights into advancing safe and reliable clinical computation."
}