% VERICITE paper (Ma, Chu, Fuhr) in the BioNLP 2026 proceedings, pp. 753--759.
% NOTE(review): the url field targets a preview.aclanthology.org ingest build;
% confirm the canonical Anthology link before relying on it.
% The support-rate range in the abstract is written with an en dash ("--");
% the exported text had "27?41", presumably a mis-encoded dash.
@inproceedings{ma-etal-2026-vericite,
    title = "{VERICITE}: Evaluating Sentence-Level Citation Faithfulness in Retrieval-Augmented Medical Question Answering",
    author = "Ma, Yixian and
      Chu, Bohao and
      Fuhr, Norbert",
    editor = "Demner-Fushman, Dina and
      Ananiadou, Sophia and
      Roberts, Kirk and
      Tsujii, Junichi",
    booktitle = "{B}io{NLP} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.62/",
    pages = "753--759",
    ISBN = "979-8-89176-434-7",
    abstract = "Retrieval-augmented generation (RAG) reduces hallucination in large language models by grounding outputs in retrieved evidence, but it does not guarantee that the resulting citations support the associated claims. We present VERICITE, a framework for evaluating citation faithfulness in retrieval-augmented medical QA. Our system retrieves PubMed abstracts via the NCBI E-Utilities API, prompts LLMs to generate answers with inline citations, and verifies each citation at the sentence level using a DeBERTa-v3-large NLI model. We evaluate four LLMs on 500 BioASQ questions at retrieval depths of 3 and 5, with extended experiments up to k = 15 and an oracle setting with gold standard documents. Only 27--41{\%} of citation pairs are supported at the sentence level at retrieval depths of 3 and 5, with support rates declining further at larger k. Under the oracle condition, answer quality improves, but citation faithfulness does not substantially improve, suggesting that generation-side citation behavior contributes substantially to unfaithful citations."
}
Markdown (Informal)
[VERICITE: Evaluating Sentence-Level Citation Faithfulness in Retrieval-Augmented Medical Question Answering](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.62/) (Ma et al., BioNLP 2026)
ACL