@comment{NOTE(review): the url field of hasan-etal-2026-beyond points at a preview/ingest host; confirm the canonical Anthology URL (and add a doi if one has been assigned) before relying on this entry.}

@inproceedings{hasan-etal-2026-beyond,
  title     = {Beyond Lexical Similarity: Evaluating Faithfulness in {LLM}-Based Medical Question Reformulation},
  author    = {Hasan, Md Rabiul and
               Ayalew, Aleka Melese and
               Oussalah, Mourad},
  editor    = {Lopez-Garcia, Guillermo and
               Gonzalez-Hernandez, Graciela},
  booktitle = {Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM4H-HeaRD} 2026) Workshop and Shared Tasks},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.smm4h-1.16/},
  pages     = {93--102},
  isbn      = {979-8-89176-432-3},
  abstract  = {Medical query rewriting transforms verbose consumer health questions into concise clinical queries, a critical step in health information retrieval. Large language models (LLMs) perform well on this task by standard metrics, yet high ROUGE or BERTScore does not guarantee preservation of clinical content. To address this issue, we introduce MedFaith-F1, a category-level faithfulness metric over four clinically salient categories: diagnoses, medications, procedures, and follow-up intent. We further propose a hybrid Evidence and Knowledge-Grounded Retrieval-Augmented Generation EKG-RAG, an evidence and knowledge-grounded framework combining hybrid retrieval over PubMed and MedlinePlus resources with UMLS (Unified Medical Language System)-aligned ontology grounding. Evaluating large language models LLaMA-3 and Qwen2.5 across zero-shot, few-shot, and QLoRA settings on MeQSum and medical question-pair (MQP) datasets revealed that base models exhibit category-level hallucination rates exceeding 40{\%}, invisible to standard metrics, while EKG-RAG with QLoRA reduces this rate to 26.75{\%}, achieving MedFaith-F1 of 0.73. Our findings call for faithfulness-aware evaluation in clinical query rewriting, and MedFaith-F1 provides a reproducible step in that direction.},
}
Markdown (Informal)
[Beyond Lexical Similarity: Evaluating Faithfulness in LLM-Based Medical Question Reformulation](https://preview.aclanthology.org/ingest-acl-workshops/2026.smm4h-1.16/) (Hasan et al., SMM4H 2026)
ACL