@inproceedings{wang-etal-2020-learning,
title = "Learning from Unlabelled Data for Clinical Semantic Textual Similarity",
author = "Wang, Yuxia and
Verspoor, Karin and
Baldwin, Timothy",
editor = "Rumshisky, Anna and
Roberts, Kirk and
Bethard, Steven and
Naumann, Tristan",
booktitle = "Proceedings of the 3rd Clinical Natural Language Processing Workshop",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.clinicalnlp-1.25/",
doi = "10.18653/v1/2020.clinicalnlp-1.25",
pages = "227--233",
    abstract = "Domain pretraining followed by task fine-tuning has become the standard paradigm for NLP tasks, but requires in-domain labelled data for task fine-tuning. To overcome this, we propose to utilise domain unlabelled data by assigning pseudo labels from a general model. We evaluate the approach on two clinical STS datasets, and achieve r = 0.80 on N2C2-STS. Further investigation reveals that if the data distribution of unlabelled sentence pairs is closer to the test data, we can obtain better performance. By leveraging a large general-purpose STS dataset and small-scale in-domain training data, we obtain further improvements to r = 0.90, a new SOTA."
}
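
The pseudo-labelling setup described in the abstract (a general-purpose STS model scoring unlabelled clinical sentence pairs, with those scores reused as training targets for an in-domain model) could look roughly like the following minimal sketch. It assumes the sentence-transformers library; the teacher model name and the clinical sentence pairs are illustrative placeholders, not the authors' actual pipeline or data.

    from sentence_transformers import CrossEncoder, InputExample

    # General-purpose STS model used as the "teacher" (illustrative choice).
    teacher = CrossEncoder("cross-encoder/stsb-roberta-base")

    # Unlabelled clinical sentence pairs (placeholders, not from N2C2-STS).
    clinical_pairs = [
        ("Patient denies chest pain.", "No chest pain reported."),
        ("Continue metformin 500 mg twice daily.", "Metformin 500 mg BID continued."),
    ]

    # Pseudo-label each pair with the teacher's similarity score.
    pseudo_scores = teacher.predict(clinical_pairs)

    # Wrap the pseudo-labelled pairs as training examples; these would then be
    # combined with any small in-domain labelled set to fine-tune a clinical
    # STS model.
    train_examples = [
        InputExample(texts=[s1, s2], label=float(score))
        for (s1, s2), score in zip(clinical_pairs, pseudo_scores)
    ]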