@inproceedings{zhou-etal-2018-visual,
title = "A Visual Attention Grounding Neural Model for Multimodal Machine Translation",
author = "Zhou, Mingyang and
Cheng, Runxiang and
Lee, Yong Jae and
Yu, Zhou",
editor = "Riloff, Ellen and
Chiang, David and
Hockenmaier, Julia and
Tsujii, Jun{'}ichi",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
month = oct # "-" # nov,
year = "2018",
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/D18-1400/",
doi = "10.18653/v1/D18-1400",
pages = "3643--3653",
abstract = "We introduce a novel multimodal machine translation model that utilizes parallel visual and textual information. Our model jointly optimizes the learning of a shared visual-language embedding and a translator. The model leverages a visual attention grounding mechanism that links the visual semantics with the corresponding textual semantics. Our approach achieves competitive state-of-the-art results on the Multi30K and the Ambiguous COCO datasets. We also collected a new multilingual multimodal product description dataset to simulate a real-world international online shopping scenario. On this dataset, our visual attention grounding model outperforms other methods by a large margin."
}
Markdown (Informal)
[A Visual Attention Grounding Neural Model for Multimodal Machine Translation](https://aclanthology.org/D18-1400/) (Zhou et al., EMNLP 2018)
ACL
Mingyang Zhou, Runxiang Cheng, Yong Jae Lee, and Zhou Yu. 2018. A Visual Attention Grounding Neural Model for Multimodal Machine Translation. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 3643–3653, Brussels, Belgium. Association for Computational Linguistics.
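The abstract describes the model only at a high level: a translator trained jointly with a shared visual-language embedding, where a visual attention grounding mechanism links image semantics to the sentence representation. The snippet below is a minimal PyTorch sketch of that general idea, assuming a GRU encoder/decoder, pre-extracted image region features, and a simple cosine alignment term; all module names, dimensions, and the exact loss form are illustrative assumptions, not the authors' implementation.

```python
# Illustrative sketch only: a translator plus a shared visual-language embedding
# trained jointly, with text-conditioned attention over image regions ("grounding").
import torch
import torch.nn as nn
import torch.nn.functional as F


class VisualAttentionGroundingSketch(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, emb_dim=256, hid_dim=512,
                 img_feat_dim=2048, shared_dim=512):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, emb_dim)
        self.encoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.img_proj = nn.Linear(img_feat_dim, shared_dim)  # image regions -> shared space
        self.txt_proj = nn.Linear(hid_dim, shared_dim)       # sentence state -> shared space
        self.tgt_emb = nn.Embedding(tgt_vocab, emb_dim)
        self.decoder = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim, tgt_vocab)

    def forward(self, src_ids, img_regions, tgt_ids):
        # Encode the source sentence; use its final hidden state as a summary.
        enc_out, enc_h = self.encoder(self.src_emb(src_ids))              # enc_h: (1, B, H)
        sent = self.txt_proj(enc_h.squeeze(0))                            # (B, D)

        # Visual attention grounding: attend over image regions with the sentence query.
        regions = self.img_proj(img_regions)                              # (B, R, D)
        scores = torch.bmm(regions, sent.unsqueeze(2)).squeeze(2)         # (B, R)
        alpha = F.softmax(scores, dim=1)
        grounded_img = torch.bmm(alpha.unsqueeze(1), regions).squeeze(1)  # (B, D)

        # Translator: a plain teacher-forced decoder initialised from the encoder state.
        dec_out, _ = self.decoder(self.tgt_emb(tgt_ids[:, :-1]), enc_h)
        logits = self.out(dec_out)                                        # (B, T-1, V)

        # Joint objective: translation cross-entropy + embedding alignment
        # (a simple cosine-distance term here; the paper's exact loss may differ).
        trans_loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)),
                                     tgt_ids[:, 1:].reshape(-1))
        align_loss = 1.0 - F.cosine_similarity(sent, grounded_img, dim=1).mean()
        return trans_loss + align_loss


# Toy usage with random tensors, just to show the shapes involved.
model = VisualAttentionGroundingSketch(src_vocab=1000, tgt_vocab=1200)
src = torch.randint(0, 1000, (4, 12))   # 4 source sentences of length 12
img = torch.randn(4, 36, 2048)          # 36 region features per image
tgt = torch.randint(0, 1200, (4, 14))   # target sentences of length 14
loss = model(src, img, tgt)
loss.backward()
```

The two loss terms mirror the joint optimization the abstract mentions: the cross-entropy term trains the translator, while the alignment term pulls the grounded visual vector and the sentence embedding together in the shared space.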