@inproceedings{choi-etal-2024-transfercvlm,
title = "{T}ransfer{CVLM}: Transferring Cross-Modal Knowledge for Vision-Language Modeling",
author = "Choi, Dongha and
Kim, Jung-jae and
Lee, Hyunju",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-emnlp.975/",
doi = "10.18653/v1/2024.findings-emnlp.975",
pages = "16733--16746",
    abstract = "Recent large vision-language multimodal models pre-trained with huge amounts of image-text pairs show remarkable performance in downstream tasks. However, multimodal pre-training has limitations in terms of resources and training time when it comes to obtaining new models that surpass existing ones. To overcome these issues, we propose TransferCVLM, a method of efficient knowledge transfer that integrates pre-trained unimodal models (and a cross-modal fusion encoder) into a combined vision-language model (CVLM) without pre-training the CVLM on a large amount of multimodal data, and then, for each task application, fine-tunes the CVLM and transfers the multimodal knowledge of a teacher vision-language model to the CVLM using knowledge distillation techniques. We demonstrate that 1) the fine-tuned CVLM performs comparably to other vision-language models of similar size, 2) the multimodal knowledge transfer consistently enhances the CVLM, and the knowledge-transferred CVLM composed of large-size unimodal models outperforms the teacher multimodal model in most downstream tasks, and 3) TransferCVLM can also be used for model compression when using small-size unimodal models. We estimate that training TransferCVLM takes only 6{\%} of the pre-training of other vision-language models. Our code is available at https://github.com/DMCB-GIST/TransferCVLM."
}
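
The abstract describes transferring a teacher vision-language model's knowledge to the CVLM via knowledge distillation. Below is a minimal, hypothetical sketch of logit-level distillation of that general kind, not the TransferCVLM implementation: the function name, the loss weighting `alpha`, the temperature `T`, and the label-space size are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    """Combine the task cross-entropy with a KL term toward the teacher's softened logits."""
    # Hard-label task loss on the student's predictions.
    task_loss = F.cross_entropy(student_logits, labels)
    # Soft-label loss: match the teacher's temperature-scaled distribution.
    kd_loss = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)
    return alpha * task_loss + (1.0 - alpha) * kd_loss

# Usage with dummy tensors (shapes only; a real CVLM would take image and text inputs).
student_logits = torch.randn(8, 3129, requires_grad=True)  # e.g. a VQA answer vocabulary
teacher_logits = torch.randn(8, 3129)
labels = torch.randint(0, 3129, (8,))
loss = distillation_loss(student_logits, teacher_logits, labels)
loss.backward()
```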