% EMNLP 2024 paper, ACL Anthology ID 2024.emnlp-main.828.
% The url field uses the canonical aclanthology.org address (not a preview/staging host),
% and the abstract is plain text with no Markdown emphasis markers.
@inproceedings{li-etal-2024-leveraging,
  title     = {Leveraging {BERT} and {TFIDF} Features for Short Text Clustering via Alignment-Promoting Co-Training},
  author    = {Li, Zetong and
               Su, Qinliang and
               Si, Shijing and
               Yu, Jianxing},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.828/},
  doi       = {10.18653/v1/2024.emnlp-main.828},
  pages     = {14897--14913},
  abstract  = {BERT and TFIDF features excel in capturing rich semantics and important words, respectively. Since most existing clustering methods are solely based on the BERT model, they often fall short in utilizing keyword information, which, however, is very useful in clustering short texts. In this paper, we propose a CO-Training Clustering (COTC) framework to make use of the collective strengths of BERT and TFIDF features. Specifically, we develop two modules responsible for the clustering of BERT and TFIDF features, respectively. We use the deep representations and cluster assignments from the TFIDF module outputs to guide the learning of the BERT module, seeking to align them at both the representation and cluster levels. Reversely, we also use the BERT module outputs to train the TFIDF module, thus leading to the mutual promotion. We then show that the alternating co-training framework can be placed under a unified joint training objective, which allows the two modules to be connected tightly and the training signals to be propagated efficiently. Experiments on eight benchmark datasets show that our method outperforms current SOTA methods significantly.},
}
Markdown (Informal)
[Leveraging BERT and TFIDF Features for Short Text Clustering via Alignment-Promoting Co-Training](https://aclanthology.org/2024.emnlp-main.828/) (Li et al., EMNLP 2024)
ACL