@inproceedings{fan-etal-2019-bridging,
title = "Bridging by Word: Image Grounded Vocabulary Construction for Visual Captioning",
author = "Fan, Zhihao and
Wei, Zhongyu and
Wang, Siyuan and
Huang, Xuanjing",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/P19-1652/",
doi = "10.18653/v1/P19-1652",
pages = "6514--6524",
abstract = "Image Captioning aims at generating a short description for an image. Existing research usually employs the architecture of CNN-RNN that views the generation as a sequential decision-making process and the entire dataset vocabulary is used as decoding space. They suffer from generating high frequent n-gram with irrelevant words. To tackle this problem, we propose to construct an image-grounded vocabulary, based on which, captions are generated with limitation and guidance. In specific, a novel hierarchical structure is proposed to construct the vocabulary incorporating both visual information and relations among words. For generation, we propose a word-aware RNN cell incorporating vocabulary information into the decoding process directly. Reinforce algorithm is employed to train the generator using constraint vocabulary as action space. Experimental results on MS COCO and Flickr30k show the effectiveness of our framework compared to some state-of-the-art models."
}