@inproceedings{wang-etal-2024-vaegpt,
title = "{VAEGPT}-Sim: Improving Sentence Representation with Limited Corpus Using Gradually-Denoising {VAE}",
author = "Wang, Zhenyi and
Ning, Haiyan and
Ling, Qing and
Wang, Dan",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.findings-acl.513/",
doi = "10.18653/v1/2024.findings-acl.513",
pages = "8666--8681",
abstract = "Text embedding requires a highly efficient method for training domain-specific models on limited data, as general models trained on large corpora lack universal applicability in highly specific fields. Therefore, we have introduced VAEGPT-Sim, an innovative model for generating synonyms that combines a denoising variational autoencoder with a target-specific discriminator to generate synonymous sentences that closely resemble human language. Even when trained with completely unsupervised settings, it maintains a harmonious balance between semantic similarity and lexical diversity, as shown by a comprehensive evaluation metric system with the highest average scores compared to other generative models. When VAEGPT-Sim is utilized as a module for contrastive learning in text representation, it delivers state-of-the-art results in small-dataset training on STS benchmarks, surpassing ConSERT by 2.8 points. This approach optimizes the effectiveness of text representation despite a limited corpus, signifying an advancement in domain-specific embedding technology."
}