@inproceedings{bartels-etal-2025-large,
title = "Can Large Language Models Accurately Generate Answer Keys for Health-related Questions?",
author = "Bartels, Davis and
Gupta, Deepak and
Demner-Fushman, Dina",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.acl-short.28/",
pages = "354--368",
ISBN = "979-8-89176-252-7",
abstract = "The evaluation of text generated by LLMs remains a challenge for question answering, retrieval augmented generation (RAG), summarization, and many other natural language processing tasks. Evaluating the factuality of LLM generated responses is particularly important in medical question answering, where the stakes are high. One method of evaluating the factuality of text is through the use of information nuggets (answer keys). Nuggets are text representing atomic facts that may be used by an assessor to make a binary decision as to whether the fact represented by said nugget is contained in an answer. Although manual nugget extraction is expensive and time-consuming, recent RAG shared task evaluations have explored automating the nuggetization of text with LLMs. In this work, we explore several approaches to nugget generation for medical question answering and evaluate their alignment with expert human nugget generation. We find providing an example and extracting nuggets from an answer to be the best approach to nuggetization. While, overall, we found the capabilities of LLMs to distill atomic facts limited, Llama 3.3 performed the best out of the models we tested."
}