% Kim et al., ACL 2023 (Volume 1: Long Papers), paper 394.
% The url field uses the permanent ACL Anthology address rather than a
% temporary preview/ingest build, which is removed once ingestion completes.
@inproceedings{kim-etal-2023-automatic,
  title     = {Automatic Creation of Named Entity Recognition Datasets by Querying Phrase Representations},
  author    = {Kim, Hyunjae and
               Yoo, Jaehyo and
               Yoon, Seunghyun and
               Kang, Jaewoo},
  editor    = {Rogers, Anna and
               Boyd-Graber, Jordan and
               Okazaki, Naoaki},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.acl-long.394/},
  doi       = {10.18653/v1/2023.acl-long.394},
  pages     = {7148--7163},
  abstract  = {Most weakly supervised named entity recognition (NER) models rely on domain-specific dictionaries provided by experts. This approach is infeasible in many domains where dictionaries do not exist. While a phrase retrieval model was used to construct pseudo-dictionaries with entities retrieved from Wikipedia automatically in a recent study, these dictionaries often have limited coverage because the retriever is likely to retrieve popular entities rather than rare ones. In this study, we present a novel framework, HighGEN, that generates NER datasets with high-coverage pseudo-dictionaries. Specifically, we create entity-rich dictionaries with a novel search method, called phrase embedding search, which encourages the retriever to search a space densely populated with various entities. In addition, we use a new verification process based on the embedding distance between candidate entity mentions and entity types to reduce the false-positive noise in weak labels generated by high-coverage dictionaries. We demonstrate that HighGEN outperforms the previous best model by an average F1 score of 4.7 across five NER benchmark datasets.},
}
Markdown (Informal)
[Automatic Creation of Named Entity Recognition Datasets by Querying Phrase Representations](https://aclanthology.org/2023.acl-long.394/) (Kim et al., ACL 2023)
ACL