% ACL Anthology record 2024.findings-acl.610 (Findings of ACL 2024); url is the canonical Anthology page, not a preview-branch build.
@inproceedings{wang-etal-2024-imapscore,
  title     = {{imapScore}: Medical Fact Evaluation Made Easy},
  author    = {Wang, Huimin and
               Zhao, Yutian and
               Wu, Xian and
               Zheng, Yefeng},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.610/},
  doi       = {10.18653/v1/2024.findings-acl.610},
  pages     = {10242--10257},
  abstract  = {Automatic evaluation of natural language generation (NLG) tasks has gained extensive research interests, since it can rapidly assess the performance of large language models (LLMs). However, automatic NLG evaluation struggles with medical QA because it fails to focus on the crucial correctness of medical facts throughout the generated text. To address this, this paper introduces a new data structure, \textit{imap}, designed to capture key information in questions and answers, enabling evaluators to focus on essential details. The \textit{imap} comprises three components: Query, Constraint, and Inform, each of which is in the form of term-value pairs to represent medical facts in a structural manner. We then introduce \textit{imap}Score, which compares the corresponding medical term-value pairs in the \textit{imap} to score generated texts. We utilize GPT-4 to extract \textit{imap} from questions, human-annotated answers, and generated responses. To mitigate the diversity in medical terminology for fair term-value pairs comparison, we use a medical knowledge graph to assist GPT-4 in determining matches. To compare \textit{imap}Score with existing NLG metrics, we establish a new benchmark dataset. The experimental results show that \textit{imap}Score consistently outperforms state-of-the-art metrics, demonstrating an average improvement of 79.8{\%} in correlation with human scores. Furthermore, incorporating \textit{imap} into n-gram, embedding, and LLM metrics boosts the base versions, increasing correlation with human scores by averages of 89.9{\%}, 81.7{\%}, and 32.6{\%}, respectively.},
}
Markdown (Informal)
[imapScore: Medical Fact Evaluation Made Easy](https://aclanthology.org/2024.findings-acl.610/) (Wang et al., Findings 2024)
ACL
- Huimin Wang, Yutian Zhao, Xian Wu, and Yefeng Zheng. 2024. imapScore: Medical Fact Evaluation Made Easy. In Findings of the Association for Computational Linguistics: ACL 2024, pages 10242–10257, Bangkok, Thailand. Association for Computational Linguistics.