% MedAct workshop paper (Krishna, Pridgen, Silverstein), GEM workshop proceedings, 2026.
% The `address` field follows the source export and holds the venue location.
% NOTE(review): `url` points at a preview.aclanthology.org ingest path; confirm
% and switch to the canonical Anthology URL once the volume is published.
@inproceedings{krishna-etal-2026-medact,
  title     = {{MedAct}: Removing the Human Bottleneck in Benchmarking Clinical {LLM} Safety},
  author    = {Krishna, Arjun and
               Pridgen, Brian and
               Silverstein, Max},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.24/},
  pages     = {222--230},
  isbn      = {979-8-89176-423-1},
  abstract  = {Most medical benchmarks for large language models test factual recall through multiple-choice questions, but on-the-ground physicians do not have the luxury of four options to choose from. NOHARM (Wu et al., 2025) demonstrated this limitation using 100 real eConsult cases annotated by 29 board-certified physicians, showing that action-level evaluation reveals omission and commission failure modes invisible to multiple-choice tests. However, NOHARM{'}s cases are closed and their creation required substantial expert physician time, creating a human bottleneck that limits the scalability and openness of this evaluation approach. We present MedAct, an open replication of NOHARM{'}s evaluation methodology using synthetically generated cases. Our contribution is a multi-stage generation pipeline that uses language models grounded in clinical practice guidelines to produce 100 cases across ten specialties, each containing roughly 50 plausible next-step actions labeled as Appropriate or Inappropriate using NOHARM{'}s scoring framework. The pipeline includes structural quality controls: 83 of 100 cases pass all five automated checks, and answer-leaking language appears in only 0.06{\%} of actions. In a pilot evaluation of nine contemporary LLMs using this synthetic benchmark, we observe patterns consistent with NOHARM{'}s findings on human-curated cases, including that omissions dominate error volume while commissions dominate severe errors. We release all cases, rubrics, generation tooling, and scoring code openly, removing the human-bottleneck barrier to action-level clinical LLM evaluation.},
}
Markdown (Informal)
[MedAct: Removing the Human Bottleneck in Benchmarking Clinical LLM Safety](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.24/) (Krishna et al., GEM 2026)
ACL