@inproceedings{tan-bansal-2020-vokenization,
title = "Vokenization: Improving Language Understanding with Contextualized, Visual-Grounded Supervision",
author = "Tan, Hao and
Bansal, Mohit",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2020.emnlp-main.162/",
doi = "10.18653/v1/2020.emnlp-main.162",
pages = "2066--2080",
abstract = "Humans learn language by listening, speaking, writing, reading, and also, via interaction with the multimodal real world. Existing language pre-training frameworks show the effectiveness of text-only self-supervision while we explore the idea of a visually-supervised language model in this paper. We find that the main reason hindering this exploration is the large divergence in magnitude and distributions between the visually-grounded language datasets and pure-language corpora. Therefore, we develop a technique named {\textquotedblleft}vokenization{\textquotedblright} that extrapolates multimodal alignments to language-only data by contextually mapping language tokens to their related images (which we call {\textquotedblleft}vokens{\textquotedblright}). The {\textquotedblleft}vokenizer{\textquotedblright} is trained on relatively small image captioning datasets and we then apply it to generate vokens for large language corpora. Trained with these contextually generated vokens, our visually-supervised language models show consistent improvements over self-supervised alternatives on multiple pure-language tasks such as GLUE, SQuAD, and SWAG."
}