@inproceedings{madhyastha-etal-2018-end,
title = "End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space",
author = "Madhyastha, Pranava Swaroop and
Wang, Josiah and
Specia, Lucia",
editor = "Linzen, Tal and
Chrupa{\l}a, Grzegorz and
Alishahi, Afra",
booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
month = nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/W18-5455/",
doi = "10.18653/v1/W18-5455",
pages = "381--383",
abstract = "We hypothesize that end-to-end neural image captioning systems work seemingly well because they exploit and learn {\textquoteleft}distributional similarity' in a multimodal feature space, by mapping a test image to similar training images in this space and generating a caption from the same space. To validate our hypothesis, we focus on the {\textquoteleft}image' side of image captioning, and vary the input image representation but keep the RNN text generation model of a CNN-RNN constant. Our analysis indicates that image captioning models (i) are capable of separating structure from noisy input representations; (ii) experience virtually no significant performance loss when a high dimensional representation is compressed to a lower dimensional space; (iii) cluster images with similar visual and linguistic information together. Our experiments all point to one fact: that our distributional similarity hypothesis holds. We conclude that, regardless of the image representation, image captioning systems seem to match images and generate captions in a learned joint image-text semantic subspace."
}
Markdown (Informal)
[End-to-end Image Captioning Exploits Distributional Similarity in Multimodal Space](https://aclanthology.org/W18-5455/) (Madhyastha et al., EMNLP 2018)