% Paper from the BioNLP 2026 proceedings (Anthology ID 2026.bionlp-1.7, per the url field).
@inproceedings{wang-etal-2026-training-biomedical,
  author    = {Wang, Xing David and
               Le Thanh, Duy and
               Leser, Ulf},
  title     = {Training Biomedical Retrievers From Large-Scale Citation Contexts},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{B}io{NLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  pages     = {75--83},
  isbn      = {979-8-89176-434-7},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.7/},
  abstract  = {The MedCPT model has demonstrated that strong biomedical retrievers can be trained using proprietary PubMed search logs. In this work, we study whether freely available citation sentences are sufficient to train similarly effective models. We construct a large-scale training dataset of {\textasciitilde} 62 million citation sentence-abstract pairs extracted from PubMed Central. We train a lightweight BERT-based retriever-reranker model called CiteRec on this dataset and evaluate it across three benchmark settings: (a) the biomedical subset of BEIR for information retrieval, (b) SciRepEval for generalizable scientific document embeddings, and (c) CitancePlus, a new set of {\textasciitilde} 90 thousand citation sentence-abstract pairs for PubMed-scale citation recommendation. We show that CiteRec performs competitively with MedCPT on the biomedical BEIR subset and outperforms it on SciRepEval. On CitancePlus, CiteRec achieves strong performance for citation recommendation over the full PubMed corpus, outperforming both MedCPT and a substantially larger Qwen3-Embedding-8B retriever.},
}
Markdown (Informal)
[Training Biomedical Retrievers From Large-Scale Citation Contexts](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.7/) (Wang et al., BioNLP 2026)
ACL