% EMNLP 2017 conference paper, ACL Anthology ID D17-1303.
% The url field is the canonical Anthology landing page (not a temporary preview build).
@inproceedings{gella-etal-2017-image,
  author    = {Gella, Spandana and
               Sennrich, Rico and
               Keller, Frank and
               Lapata, Mirella},
  title     = {Image Pivoting for Learning Multilingual Multimodal Representations},
  editor    = {Palmer, Martha and
               Hwa, Rebecca and
               Riedel, Sebastian},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month     = sep,
  year      = 2017,
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {2839--2845},
  doi       = {10.18653/v1/D17-1303},
  url       = {https://aclanthology.org/D17-1303/},
  abstract  = {In this paper we propose a model to learn multimodal multilingual representations for matching images and sentences in different languages, with the aim of advancing multilingual versions of image search and image understanding. Our model learns a common representation for images and their descriptions in two different languages (which need not be parallel) by considering the image as a pivot between two languages. We introduce a new pairwise ranking loss function which can handle both symmetric and asymmetric similarity between the two modalities. We evaluate our models on image-description ranking for German and English, and on semantic textual similarity of image descriptions in English. In both cases we achieve state-of-the-art performance.},
}
Markdown (Informal)
[Image Pivoting for Learning Multilingual Multimodal Representations](https://aclanthology.org/D17-1303/) (Gella et al., EMNLP 2017)
ACL