% Conference-paper entry for "Towards Unified Factuality Evaluation ..." (booktitle: BioNLP 2026).
% Numeric ranges in the abstract use LaTeX en dashes (--); the export had them garbled as "?".
% NOTE(review): url points at a preview.aclanthology.org ingest path -- presumably a staging
% location; confirm and swap in the canonical Anthology URL/DOI once the volume is published.
@inproceedings{roy-roy-2026-towards,
  title     = {Towards Unified Factuality Evaluation for Biomedical {QA} and Summarization: Aligning Metrics with Clinical Use-Cases},
  author    = {Roy, Mahule and
               Roy, Subhas},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.2/},
  pages     = {15--19},
  isbn      = {979-8-89176-434-7},
  abstract  = {Large language models achieve strong performance on biomedical question answering and summarization benchmarks, yet traditional evaluation metrics often fail to detect clinically significant factual errors. We introduce a unified evaluation framework that combines reference-based measures with evidence-grounded factuality verification to assess biomedical text generation. Evaluating four open-source models across three benchmarks (BioASQ, PubMedQA, MedLFQA), we find that 13.4--24.7{\%} of generated claims are contradicted and 23--41{\%} are unsupported, despite high lexical overlap scores. Our proposed Fact-Aligned Score (FAS) correlates strongly with claim-level verifiability (rho=0.68), substantially outperforming ROUGE-L (rho=0.41). We release an open-source toolkit with model outputs and analysis scripts to support reproducible factuality evaluation and safer deployment of biomedical LLMs.},
}
Markdown (Informal)
[Towards Unified Factuality Evaluation for Biomedical QA and Summarization: Aligning Metrics with Clinical Use-Cases](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.2/) (Roy & Roy, BioNLP 2026)
ACL