@comment{
  Conference paper in the NoDaLiDa 2021 proceedings (ACL Anthology ID
  2021.nodalida-main.21). The url field uses the canonical Anthology
  address built from that ID, not a temporary preview-branch build.
  The month field keeps the day range as a quoted literal because the
  conference dates span two months.
}
@inproceedings{jensen-etal-2021-de,
  title     = {De-identification of Privacy-related Entities in Job Postings},
  author    = {Jensen, Kristian N{\o}rgaard and
               Zhang, Mike and
               Plank, Barbara},
  editor    = {Dobnik, Simon and
               {\O}vrelid, Lilja},
  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)},
  month     = may # " 31--2 " # jun,
  year      = {2021},
  address   = {Reykjavik, Iceland (Online)},
  publisher = {Link{\"o}ping University Electronic Press, Sweden},
  url       = {https://aclanthology.org/2021.nodalida-main.21/},
  pages     = {210--221},
  abstract  = {De-identification is the task of detecting privacy-related entities in text, such as person names, emails and contact data. It has been well-studied within the medical domain. The need for de-identification technology is increasing, as privacy-preserving data handling is in high demand in many domains. In this paper, we focus on job postings. We present JobStack, a new corpus for de-identification of personal data in job vacancies on Stackoverflow. We introduce baselines, comparing Long-Short Term Memory (LSTM) and Transformer models. To improve these baselines, we experiment with BERT representations, and distantly related auxiliary data via multi-task learning. Our results show that auxiliary data helps to improve de-identification performance. While BERT representations improve performance, surprisingly {\textquotedblleft}vanilla{\textquotedblright} BERT turned out to be more effective than BERT trained on Stackoverflow-related data.},
}
Markdown (Informal)
[De-identification of Privacy-related Entities in Job Postings](https://aclanthology.org/2021.nodalida-main.21/) (Jensen et al., NoDaLiDa 2021)
ACL