% Conference paper in the ACL 2025 long-paper proceedings (Volume 1).
% The url field uses the permanent ACL Anthology address built from the
% Anthology ID 2025.acl-long.1329, not the temporary ingestion preview host.
@inproceedings{vakili-etal-2025-data,
  title     = {Data-Constrained Synthesis of Training Data for De-Identification},
  author    = {Vakili, Thomas and
               Henriksson, Aron and
               Dalianis, Hercules},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.1329/},
  pages     = {27414--27427},
  isbn      = {979-8-89176-251-0},
  abstract  = {Many sensitive domains {---} such as the clinical domain {---} lack widely available datasets due to privacy risks. The increasing generative capabilities of large language models (LLMs) have made synthetic datasets a viable path forward. In this study, we domain-adapt LLMs to the clinical domain and generate synthetic clinical texts that are machine-annotated with tags for personally identifiable information using capable encoder-based NER models. The synthetic corpora are then used to train synthetic NER models. The results show that training NER models using synthetic corpora incurs only a small drop in predictive performance. The limits of this process are investigated in a systematic ablation study {---} using both Swedish and Spanish data. Our analysis shows that smaller datasets can be sufficient for domain-adapting LLMs for data synthesis. Instead, the effectiveness of this process is almost entirely contingent on the performance of the machine-annotating NER models trained using the original data.},
}
Markdown (Informal)
[Data-Constrained Synthesis of Training Data for De-Identification](https://aclanthology.org/2025.acl-long.1329/) (Vakili et al., ACL 2025)
ACL