% NAACL 2025 Student Research Workshop paper introducing the SPY synthetic PII-detection dataset.
@inproceedings{savkin-etal-2025-spy,
  title     = {{SPY}: Enhancing Privacy with Synthetic {PII} Detection Dataset},
  author    = {Savkin, Maksim and
               Ionov, Timur and
               Konovalov, Vasily},
  editor    = {Ebrahimi, Abteen and
               Haider, Samar and
               Liu, Emmy and
               Haider, Sammar and
               Leonor Pacheco, Maria and
               Wein, Shira},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-srw.23/},
  pages     = {236--246},
  isbn      = {979-8-89176-192-6},
  abstract  = {We introduce SPY Dataset: a novel synthetic dataset for the task of Personal Identifiable Information (PII) detection, underscoring the significance of protecting PII in modern data processing. Our research innovates by leveraging Large Language Models (LLMs) to generate a dataset that emulates real-world PII scenarios. Through evaluation, we validate the dataset{'}s quality, providing a benchmark for PII detection. Comparative analyses reveal that while PII and Named Entity Recognition (NER) share similarities, dedicated NER models exhibit limitations when applied to PII-specific contexts. This work contributes to the field by making the generation methodology and the generated dataset publicly, thereby enabling further research and development in this field.},
}
Markdown (Informal)
[SPY: Enhancing Privacy with Synthetic PII Detection Dataset](https://aclanthology.org/2025.naacl-srw.23/) (Savkin et al., NAACL 2025)
ACL
- Maksim Savkin, Timur Ionov, and Vasily Konovalov. 2025. SPY: Enhancing Privacy with Synthetic PII Detection Dataset. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 236–246, Albuquerque, USA. Association for Computational Linguistics.