@inproceedings{sato-etal-2024-improving,
    title     = {Improving Sentence Embeddings with Automatic Generation of Training Data Using Few-shot Examples},
    author    = {Sato, Soma and
                 Tsukagoshi, Hayato and
                 Sasano, Ryohei and
                 Takeda, Koichi},
    editor    = {Fu, Xiyan and
                 Fleisig, Eve},
    booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)},
    month     = aug,
    year      = {2024},
    address   = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.acl-srw.43/},
    doi       = {10.18653/v1/2024.acl-srw.43},
    pages     = {378--389},
    isbn      = {979-8-89176-097-4},
    abstract  = {Decoder-based large language models (LLMs) have shown high performance on many tasks in natural language processing. This is also true for sentence embedding learning, where a decoder-based model, PromptEOL, has achieved the best performance on semantic textual similarity (STS) tasks. However, PromptEOL requires a manually annotated natural language inference (NLI) dataset for fine-tuning. We aim to improve sentence embeddings without using large manually annotated datasets by automatically generating an NLI dataset with an LLM and using it for fine-tuning of PromptEOL. To achieve this, we explore methods of data generation suitable for sentence embedding learning in this study. Specifically, we will focus on automatic dataset generation through few-shot learning and explore the appropriate methods to leverage few-shot examples. Experimental results on the STS tasks demonstrate that our approach outperforms existing models in settings without large manually annotated datasets.},
}
Markdown (Informal)
[Improving Sentence Embeddings with Automatic Generation of Training Data Using Few-shot Examples](https://aclanthology.org/2024.acl-srw.43/) (Sato et al., ACL 2024)
ACL