@inproceedings{kiela-etal-2018-learning,
    title     = {Learning Visually Grounded Sentence Representations},
    author    = {Kiela, Douwe and
      Conneau, Alexis and
      Jabri, Allan and
      Nickel, Maximilian},
    editor    = {Walker, Marilyn and
      Ji, Heng and
      Stent, Amanda},
    booktitle = {Proceedings of the 2018 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
    month     = jun,
    year      = {2018},
    address   = {New Orleans, Louisiana},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/N18-1038/},
    doi       = {10.18653/v1/N18-1038},
    pages     = {408--418},
    abstract  = {We investigate grounded sentence representations, where we train a sentence encoder to predict the image features of a given caption{---}i.e., we try to {\textquotedblleft}imagine{\textquotedblright} how a sentence would be depicted visually{---}and use the resultant features as sentence representations. We examine the quality of the learned representations on a variety of standard sentence representation quality benchmarks, showing improved performance for grounded models over non-grounded ones. In addition, we thoroughly analyze the extent to which grounding contributes to improved performance, and show that the system also learns improved word embeddings.},
}
Markdown (Informal)
[Learning Visually Grounded Sentence Representations](https://aclanthology.org/N18-1038/) (Kiela et al., NAACL 2018)
ACL
- Douwe Kiela, Alexis Conneau, Allan Jabri, and Maximilian Nickel. 2018. Learning Visually Grounded Sentence Representations. In Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pages 408–418, New Orleans, Louisiana. Association for Computational Linguistics.