% EMNLP 2025 main-conference paper (ACL Anthology ID 2025.emnlp-main.620).
% The system name is brace-protected as a whole word so sentence-casing styles
% keep its capitalisation; url is the permanent Anthology address for this ID.
@inproceedings{wang-etal-2025-raresyn,
  title     = {{RareSyn}: Health Record Synthesis for Rare Disease Diagnosis},
  author    = {Wang, Huimin and
               Zhao, Yutian and
               Zheng, Yefeng and
               Wu, Xian},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.620/},
  pages     = {12322--12338},
  isbn      = {979-8-89176-332-6},
  abstract  = {Diagnosis based on Electronic Health Records (EHRs) often struggles with data scarcity and privacy concerns. To address these issues, we introduce RareSyn, an innovative data synthesis approach designed to augment and de-identify EHRs, with a focus on rare diseases. The core insight of RareSyn involves using seed EHRs of rare diseases to recall similar records from both common and rare diseases, and then leveraging Large Language Models to substitute the key medical information (e.g., symptoms or examination details) in these records with information from the knowledge graph, thereby generating new EHRs. We first train a transformer Encoder with contrastive learning to integrate various types of medical knowledge. Then, RareSyn engages in iterative processes of recalling similar EHRs, structuring EHRs, revising EHRs, and generating new EHRs until the produced EHRs achieve extensive coverage of the rare disease knowledge. We assess RareSyn based on its utility for diagnosis modeling, the diversity of medical knowledge it incorporates, and the privacy of the synthesized EHRs. Extensive experiments demonstrate its effectiveness in improving disease diagnosis, enhancing diversity, and maintaining privacy.},
}
Markdown (Informal)
[RareSyn: Health Record Synthesis for Rare Disease Diagnosis](https://aclanthology.org/2025.emnlp-main.620/) (Wang et al., EMNLP 2025)
ACL