@inproceedings{huang-etal-2019-multi,
  title     = {Multi-Head Attention with Diversity for Learning Grounded Multilingual Multimodal Representations},
  author    = {Huang, Po-Yao and
               Chang, Xiaojun and
               Hauptmann, Alexander},
  editor    = {Inui, Kentaro and
               Jiang, Jing and
               Ng, Vincent and
               Wan, Xiaojun},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing ({EMNLP}-{IJCNLP})},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/D19-1154/},
  doi       = {10.18653/v1/D19-1154},
  pages     = {1461--1467},
  abstract  = {With the aim of promoting and understanding the multilingual version of image search, we leverage visual object detection and propose a model with diverse multi-head attention to learn grounded multilingual multimodal representations. Specifically, our model attends to different types of textual semantics in two languages and visual objects for fine-grained alignments between sentences and images. We introduce a new objective function which explicitly encourages attention diversity to learn an improved visual-semantic embedding space. We evaluate our model in the German-Image and English-Image matching tasks on the Multi30K dataset, and in the Semantic Textual Similarity task with the English descriptions of visual content. Results show that our model yields a significant performance gain over other methods in all of the three tasks.},
}
Markdown (Informal)
[Multi-Head Attention with Diversity for Learning Grounded Multilingual Multimodal Representations](https://aclanthology.org/D19-1154/) (Huang et al., EMNLP-IJCNLP 2019)
ACL