@inproceedings{geigle-etal-2024-mblip,
title = "m{BLIP}: Efficient Bootstrapping of Multilingual Vision-{LLM}s",
author = "Geigle, Gregor and
Jain, Abhay and
Timofte, Radu and
Glava{\v{s}}, Goran",
editor = "Gu, Jing and
Fu, Tsu-Jui (Ray) and
Hudson, Drew and
Celikyilmaz, Asli and
Wang, William",
booktitle = "Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.alvr-1.2",
pages = "7--25",
abstract = "Modular vision-language models (Vision-LLMs) align pretrained image encoders with (frozen) large language models (LLMs) and post-hoc condition LLMs to {`}understand{'} the image input. With the abundance of readily available high-quality English image-text data as well as strong monolingual English LLMs, the research focus has been on English-only Vision-LLMs. Multilingual vision-language models are still predominantly obtained via expensive end-to-end pretraining, resulting in comparatively smaller models, trained on limited multilingual image data supplemented with text-only multilingual corpora. We present mBLIP, the first Vision-LLM leveraging multilingual LLMs, which we obtain in a computationally efficient manner on consumer-level hardware. To this end, we \textit{re-align} an image encoder previously tuned to an English LLM to a new, multilingual LLM using only a few million multilingual training examples derived from a mix of vision-and-language tasks, which we obtain by machine-translating high-quality English data to 95 languages. On the IGLUE benchmark and XM3600, mBLIP yields results competitive with state-of-the-art models and it greatly outperforms strong English-only Vision-LLMs like Llava 1.5. We release our model, code, and train data at \url{https://github.com/gregor-ge/mBLIP}.",
}
Markdown (Informal)
[mBLIP: Efficient Bootstrapping of Multilingual Vision-LLMs](https://aclanthology.org/2024.alvr-1.2) (Geigle et al., ALVR-WS 2024)
ACL