@inproceedings{gao-etal-2021-scaling,
title = "Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup",
author = "Gao, Luyu and
Zhang, Yunyi and
Han, Jiawei and
Callan, Jamie",
editor = "Rogers, Anna and
Calixto, Iacer and
Vuli{\'c}, Ivan and
Saphra, Naomi and
Kassner, Nora and
Camburu, Oana-Maria and
Bansal, Trapit and
Shwartz, Vered",
booktitle = "Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2021.repl4nlp-1.31/",
doi = "10.18653/v1/2021.repl4nlp-1.31",
pages = "316--321",
    abstract = "Contrastive learning has been applied successfully to learn vector representations of text. Previous research demonstrated that learning high-quality representations benefits from batch-wise contrastive loss with a large number of negatives. In practice, the technique of in-batch negatives is used, where for each example in a batch, other batch examples' positives will be taken as its negatives, avoiding encoding extra negatives. This, however, still conditions each example's loss on all batch examples and requires fitting the entire large batch into GPU memory. This paper introduces a gradient caching technique that decouples backpropagation between contrastive loss and the encoder, removing encoder backward pass data dependency along the batch dimension. As a result, gradients can be computed for one subset of the batch at a time, leading to almost constant memory usage."
}
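The abstract sketches the core idea of the gradient caching technique: encode the batch in small chunks without building a computation graph, compute the contrastive loss over the full batch of representations and cache its gradients with respect to those representations, then re-encode each chunk with the graph enabled and backpropagate the cached gradients into the encoder. Below is a minimal PyTorch-style sketch of that idea, assuming plain tensor inputs, a generic `encoder`, and an in-batch-negative InfoNCE loss; the names (`encoder`, `grad_cache_step`, `chunk_size`) are illustrative and not the authors' released GradCache API.

```python
# Minimal sketch of gradient caching for contrastive learning (illustrative,
# not the authors' released implementation).
import torch
import torch.nn.functional as F

def contrastive_loss(q, p):
    # In-batch negatives: each query's positive is the same-index passage;
    # every other passage in the batch acts as a negative.
    scores = q @ p.t()
    labels = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(scores, labels)

def grad_cache_step(encoder, queries, passages, chunk_size, optimizer):
    q_chunks = queries.split(chunk_size)
    p_chunks = passages.split(chunk_size)

    # Step 1: graph-free forward over all chunks to obtain full-batch
    # representations without storing per-chunk activations.
    with torch.no_grad():
        q_reps = torch.cat([encoder(c) for c in q_chunks])
        p_reps = torch.cat([encoder(c) for c in p_chunks])

    # Step 2: compute the batch-wise contrastive loss on the detached
    # representations and cache its gradients w.r.t. each representation.
    q_reps.requires_grad_(True)
    p_reps.requires_grad_(True)
    loss = contrastive_loss(q_reps, p_reps)
    loss.backward()
    q_grads = q_reps.grad.split(chunk_size)
    p_grads = p_reps.grad.split(chunk_size)

    # Step 3: re-encode one chunk at a time with the graph enabled and
    # backpropagate the cached representation gradients into the encoder.
    # (A full implementation must also keep the two forward passes
    # consistent, e.g. under dropout.)
    for chunk, grad in zip(q_chunks, q_grads):
        encoder(chunk).backward(grad)
    for chunk, grad in zip(p_chunks, p_grads):
        encoder(chunk).backward(grad)

    optimizer.step()
    optimizer.zero_grad()
    return loss.item()
```

Because only one chunk's activation graph is alive at any time, peak memory is governed by the chunk size rather than the full batch size, which is what the abstract's "almost constant memory usage" refers to.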