@inproceedings{li-etal-2025-automir,
title = "{A}uto{MIR}: Effective Zero-Shot Medical Information Retrieval without Relevance Labels",
author = "Li, Lei and
Zhang, Xiangxu and
Zhou, Xiao and
Liu, Zheng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1305/",
doi = "10.18653/v1/2025.findings-emnlp.1305",
pages = "24028--24047",
ISBN = "979-8-89176-335-7",
abstract = "Medical information retrieval (MIR) is vital for accessing knowledge from electronic health records, scientific literature, and medical databases, supporting applications such as medical education, patient queries, and clinical diagnosis. However, effective zero-shot dense retrieval in the medical domain remains difficult due to the scarcity of relevance-labeled data. To address this challenge, we propose **S**elf-**L**earning **Hy**pothetical **D**ocument **E**mbeddings (**SL-HyDE**), a framework that leverages large language models (LLMs) to generate hypothetical documents conditioned on a query. These documents encapsulate essential medical context, guiding dense retrievers toward the most relevant results. SL-HyDE further employs a self-learning mechanism that iteratively improves pseudo-document generation and retrieval using unlabeled corpora, eliminating the need for labeled data. In addition, we introduce the Chinese Medical Information Retrieval Benchmark (CMIRB), a comprehensive evaluation suite reflecting real-world medical scenarios, comprising five tasks and ten datasets. By benchmarking ten models on CMIRB, we provide a rigorous standard for evaluating MIR systems. Experimental results demonstrate that SL-HyDE significantly outperforms HyDE in retrieval accuracy, while exhibiting strong generalization and scalability across diverse LLM and retriever configurations. Our code and data are publicly available at: https://github.com/ll0ruc/AutoMIR."
}
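
The abstract describes a HyDE-style retrieval step: an LLM first writes a hypothetical document for the query, and the dense retriever embeds that pseudo-document to rank corpus passages. The sketch below illustrates that general idea only; the encoder name, prompt, and helper functions are illustrative assumptions, not the authors' released SL-HyDE implementation (see the paper's repository for that).

```python
# Minimal sketch of HyDE-style retrieval, under assumed model and prompt choices.
import numpy as np
from sentence_transformers import SentenceTransformer


def generate_hypothetical_doc(query: str) -> str:
    """Placeholder for an LLM call that writes a short pseudo-document
    answering the query (hypothetical prompt, not the paper's)."""
    return f"A medical passage answering the question: {query}."


# Any dense encoder could stand in for the retriever; this model name is an assumption.
encoder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")


def hyde_search(query: str, corpus: list[str], top_k: int = 3) -> list[str]:
    """Embed the LLM-generated pseudo-document and rank corpus passages
    by cosine similarity to it, instead of embedding the raw query."""
    pseudo_doc = generate_hypothetical_doc(query)
    q_vec = encoder.encode(pseudo_doc, normalize_embeddings=True)
    d_vecs = encoder.encode(corpus, normalize_embeddings=True)
    scores = d_vecs @ q_vec  # cosine similarity on normalized vectors
    order = np.argsort(-scores)[:top_k]
    return [corpus[i] for i in order]
```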