% Workshop paper: Second Workshop on Simple and Efficient Natural Language Processing (2021).
@inproceedings{gupta-gupta-2021-unsupervised,
  title     = {Unsupervised Contextualized Document Representation},
  author    = {Gupta, Ankur and
               Gupta, Vivek},
  editor    = {Moosavi, Nafise Sadat and
               Gurevych, Iryna and
               Fan, Angela and
               Wolf, Thomas and
               Hou, Yufang and
               Marasovi{\'c}, Ana and
               Ravi, Sujith},
  booktitle = {Proceedings of the Second Workshop on Simple and Efficient Natural Language Processing},
  month     = nov,
  year      = {2021},
  address   = {Virtual},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.sustainlp-1.17/},
  doi       = {10.18653/v1/2021.sustainlp-1.17},
  pages     = {166--173},
  abstract  = {Several NLP tasks need the effective representation of text documents. Arora et al., 2017 demonstrate that simple weighted averaging of word vectors frequently outperforms neural models. SCDV (Mekala et al., 2017) further extends this from sentences to documents by employing soft and sparse clustering over pre-computed word vectors. However, both techniques ignore the polysemy and contextual character of words. In this paper, we address this issue by proposing SCDV+BERT(ctxd), a simple and effective unsupervised representation that combines contextualized BERT (Devlin et al., 2019) based word embedding for word sense disambiguation with SCDV soft clustering approach. We show that our embeddings outperform original SCDV, pre-train BERT, and several other baselines on many classification datasets. We also demonstrate our embeddings effectiveness on other tasks, such as concept matching and sentence similarity. In addition, we show that SCDV+BERT(ctxd) outperforms fine-tune BERT and different embedding approaches in scenarios with limited data and only few shots examples.},
}
Markdown (Informal)
[Unsupervised Contextualized Document Representation](https://aclanthology.org/2021.sustainlp-1.17/) (Gupta & Gupta, sustainlp 2021)
ACL