@comment{Iki and Aizawa, EMNLP 2021 main-conference paper. The url field uses the stable ACL Anthology landing page (its ID matches the DOI suffix), not a preview-branch mirror.}
@inproceedings{iki-aizawa-2021-effect,
  title     = {Effect of Visual Extensions on Natural Language Understanding in Vision-and-Language Models},
  author    = {Iki, Taichi and
               Aizawa, Akiko},
  editor    = {Moens, Marie-Francine and
               Huang, Xuanjing and
               Specia, Lucia and
               Yih, Scott Wen-tau},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = 2021,
  address   = {Online and Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.emnlp-main.167/},
  doi       = {10.18653/v1/2021.emnlp-main.167},
  pages     = {2189--2196},
  abstract  = {A method for creating a vision-and-language (V{\&}L) model is to extend a language model through structural modifications and V{\&}L pre-training. Such an extension aims to make a V{\&}L model inherit the capability of natural language understanding (NLU) from the original language model. To see how well this is achieved, we propose to evaluate V{\&}L models using an NLU benchmark (GLUE). We compare five V{\&}L models, including single-stream and dual-stream models, trained with the same pre-training. Dual-stream models, with their higher modality independence achieved by approximately doubling the number of parameters, are expected to preserve the NLU capability better. Our main finding is that the dual-stream scores are not much different than the single-stream scores, contrary to expectation. Further analysis shows that pre-training causes the performance drop in NLU tasks with few exceptions. These results suggest that adopting a single-stream structure and devising the pre-training could be an effective method for improving the maintenance of language knowledge in V{\&}L extensions.},
}
Markdown (Informal)
[Effect of Visual Extensions on Natural Language Understanding in Vision-and-Language Models](https://aclanthology.org/2021.emnlp-main.167/) (Iki & Aizawa, EMNLP 2021)
ACL