@inproceedings{liu-etal-2025-interactive,
title = "Interactive Evaluation for Medical {LLM}s via Task-oriented Dialogue System",
author = "Liu, Ruoyu and
Xue, Kui and
Zhang, Xiaofan and
Zhang, Shaoting",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.325/",
pages = "4871--4896",
abstract = "This study focuses on evaluating proactive communication and diagnostic capabilities of medical Large Language Models (LLMs), which directly impact their effectiveness in patient consultations. In typical medical scenarios, doctors often ask a set of questions to gain a comprehensive understanding of patients' conditions. We argue that single-turn question-answering tasks such as MultiMedQA are insufficient for evaluating LLMs' medical consultation abilities. To address this limitation, we developed an evaluation benchmark called Multi-turn Medical Dialogue Evaluation (MMD-Eval), specifically designed to evaluate the proactive communication and diagnostic capabilities of medical LLMs during consultations. Considering the high cost and potential for hallucinations in LLMs, we innovatively trained a task-oriented dialogue system to simulate patients engaging in dialogues with the medical LLMs using our structured medical records dataset. This approach enabled us to generate multi-turn dialogue data. Subsequently, we evaluate the communication skills and medical expertise of the medical LLMs. All resources associated with this study will be made publicly available."
}
Markdown (Informal):
[Interactive Evaluation for Medical LLMs via Task-oriented Dialogue System](https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.325/) (Liu et al., COLING 2025)