% Conference paper from the Proceedings of the 10th International Conference
% on Natural Language Generation (2017). The url field uses the canonical
% ACL Anthology address for paper W17-3506 (same ID as the DOI suffix).
@inproceedings{tanti-etal-2017-role,
    author    = {Tanti, Marc and
                 Gatt, Albert and
                 Camilleri, Kenneth},
    title     = {What is the Role of Recurrent Neural Networks ({RNNs}) in an Image Caption Generator?},
    editor    = {Alonso, Jose M. and
                 Bugar{\'i}n, Alberto and
                 Reiter, Ehud},
    booktitle = {Proceedings of the 10th International Conference on Natural Language Generation},
    month     = sep,
    year      = {2017},
    address   = {Santiago de Compostela, Spain},
    publisher = {Association for Computational Linguistics},
    pages     = {51--60},
    doi       = {10.18653/v1/W17-3506},
    url       = {https://aclanthology.org/W17-3506/},
    abstract  = {Image captioning has evolved into a core task for Natural Language Generation and has also proved to be an important testbed for deep learning approaches to handling multimodal representations. Most contemporary approaches rely on a combination of a convolutional network to handle image features, and a recurrent network to encode linguistic information. The latter is typically viewed as the primary {\textquotedblleft}generation{\textquotedblright} component. Beyond this high-level characterisation, a CNN+RNN model supports a variety of architectural designs. The dominant model in the literature is one in which visual features encoded by a CNN are {\textquotedblleft}injected{\textquotedblright} as part of the linguistic encoding process, driving the RNN's linguistic choices. By contrast, it is possible to envisage an architecture in which visual and linguistic features are encoded separately, and merged at a subsequent stage. In this paper, we address two related questions: (1) Is direct injection the best way of combining multimodal information, or is a late merging alternative better for the image captioning task? (2) To what extent should a recurrent network be viewed as actually generating, rather than simply encoding, linguistic information?},
}
Markdown (Informal)
[What is the Role of Recurrent Neural Networks (RNNs) in an Image Caption Generator?](https://aclanthology.org/W17-3506/) (Tanti et al., INLG 2017)
ACL