@inproceedings{yim-etal-2024-err,
title = "To Err Is Human, How about Medical Large Language Models? Comparing Pre-trained Language Models for Medical Assessment Errors and Reliability",
author = "Yim, Wen-wai and
Fu, Yujuan and
Ben Abacha, Asma and
Yetisgen, Meliha",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.1409/",
pages = "16211--16223",
abstract = "Unpredictability, especially unpredictability with unknown error characteristics, is a highly undesirable trait, particularly in medical patient care applications. Although large pre-trained language models (LLM) have been applied to a variety of unseen tasks with highly competitive and successful results, their sensitivity to language inputs and resulting performance variability is not well-studied. In this work, we test state-of-the-art pre-trained language models from a variety of families to characterize their error generation and reliability in medical assessment ability. Particularly, we experiment with general medical assessment multiple choice tests, as well as their open-ended and true-false alternatives. We also profile model consistency, error agreements with each other and to humans; and finally, quantify their ability to recover and explain errors. The findings in this work can be used to give further information about medical models so that modelers can make better-informed decisions rather than relying on standalone performance metrics alone."
}
Markdown (Informal)
[To Err Is Human, How about Medical Large Language Models? Comparing Pre-trained Language Models for Medical Assessment Errors and Reliability](https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.1409/) (Yim et al., LREC-COLING 2024)
ACL