@inproceedings{jiang-etal-2023-noisy,
title = "Noisy Self-Training with Synthetic Queries for Dense Retrieval",
author = "Jiang, Fan and
Drummond, Tom and
Cohn, Trevor",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-emnlp.803/",
doi = "10.18653/v1/2023.findings-emnlp.803",
pages = "11991--12008",
abstract = "Although existing neural retrieval models reveal promising results when training data is abundant and the performance keeps improving as training data increases, collecting high-quality annotated data is prohibitively costly. To this end, we introduce a novel noisy self-training framework combined with synthetic queries, showing that neural retrievers can be improved in a self-evolution manner with no reliance on any external models. Experimental results show that our method improves consistently over existing methods on both general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval benchmarks. Extra analysis on low-resource settings reveals that our method is data efficient and outperforms competitive baselines, with as little as 30{\%} of labelled training data. Further extending the framework for reranker training demonstrates that the proposed method is general and yields additional gains on tasks of diverse domains."
}