@inproceedings{lan-etal-2025-llave,
title = "{LL}a{VE}: Large Language and Vision Embedding Models with Hardness-Weighted Contrastive Learning",
author = "Lan, Zhibin and
Niu, Liqiang and
Meng, Fandong and
Zhou, Jie and
Su, Jinsong",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.740/",
doi = "10.18653/v1/2025.findings-emnlp.740",
pages = "13721--13735",
ISBN = "979-8-89176-335-7",
abstract = "Universal multimodal embedding models play a critical role in tasks such as interleaved image-text retrieval, multimodal RAG, and multimodal clustering. However, our empirical results indicate that existing LMM-based embedding models trained with the standard InfoNCE loss exhibit a high degree of overlap in similarity distribution between positive and negative pairs, making it challenging to distinguish hard negative pairs effectively. To deal with this issue, we propose a simple yet effective framework that dynamically improves the embedding model{'}s representation learning for negative pairs based on their discriminative difficulty. Within this framework, we train a series of models, named LLaVE, and evaluate them on the MMEB benchmark, which covers 4 meta-tasks and 36 datasets. Experimental results show that LLaVE establishes stronger baselines that achieve state-of-the-art (SOTA) performance while demonstrating strong scalability and efficiency. Specifically, LLaVE-2B surpasses the previous SOTA 7B models, while LLaVE-7B achieves a further performance improvement of 6.2 points. Although LLaVE is trained on image-text data, it can generalize to text-video retrieval tasks in a zero-shot manner and achieve strong performance, demonstrating its remarkable potential for transfer to other embedding tasks."
}
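As a rough illustration of the hardness-weighted contrastive objective the abstract describes, here is a minimal PyTorch sketch. It is not the paper's exact formulation: the function name `hardness_weighted_info_nce`, the `alpha` weighting strength, and the specific scheme (scaling each negative's exponentiated logit by its detached softmax share among the row's negatives) are all illustrative assumptions. Setting `alpha=0` recovers the standard InfoNCE loss that the abstract contrasts against.

```python
# Hypothetical sketch of a hardness-weighted InfoNCE loss; not the
# exact LLaVE objective, just the general idea of up-weighting hard
# negatives (negatives whose similarity to the query is high).
import torch
import torch.nn.functional as F

def hardness_weighted_info_nce(query, target, temperature=0.05, alpha=1.0):
    """In-batch contrastive loss that up-weights hard negatives.

    query, target: (B, D) embeddings; row i of each forms a positive pair.
    alpha: weighting strength (illustrative name); alpha=0 gives plain InfoNCE.
    """
    query = F.normalize(query, dim=-1)
    target = F.normalize(target, dim=-1)
    sim = query @ target.T / temperature                       # (B, B) logits
    sim = sim - sim.max(dim=1, keepdim=True).values.detach()   # numerical stability
    B = sim.size(0)
    pos_mask = torch.eye(B, dtype=torch.bool, device=sim.device)

    exp_sim = sim.exp()
    neg_exp = exp_sim.masked_fill(pos_mask, 0.0)
    # Hardness weight per negative: its softmax share among the row's
    # negatives, detached so it rescales the loss without adding a
    # gradient path of its own. Harder negatives get larger weights.
    w = 1.0 + alpha * (B - 1) * (neg_exp / neg_exp.sum(1, keepdim=True)).detach()

    pos_exp = exp_sim.diagonal()
    denom = pos_exp + (w * neg_exp).sum(dim=1)
    return (denom.log() - pos_exp.log()).mean()
```

Because the weights multiply only the negative terms in the denominator, a batch where negatives are easy (low similarity) behaves like ordinary InfoNCE, while a batch with hard negatives sees those pairs contribute disproportionately to the loss, which matches the abstract's goal of adapting representation learning to each negative pair's discriminative difficulty.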