% Ilharco, Zhang, Baldridge: CoNLL 2019 paper (ACL Anthology ID K19-1006).
% The url field uses the canonical aclanthology.org address instead of a
% temporary preview-branch build, so the link stays valid long term.
% NOTE(review): address appears to hold the conference location (ACL
% Anthology export convention) rather than the publisher's city; confirm
% before normalising for a style that prints it next to the publisher.
@inproceedings{ilharco-etal-2019-large,
  title     = {Large-Scale Representation Learning from Visually Grounded Untranscribed Speech},
  author    = {Ilharco, Gabriel and
               Zhang, Yuan and
               Baldridge, Jason},
  editor    = {Bansal, Mohit and
               Villavicencio, Aline},
  booktitle = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/K19-1006/},
  doi       = {10.18653/v1/K19-1006},
  pages     = {55--65},
  abstract  = {Systems that can associate images with their spoken audio captions are an important step towards visually grounded language learning. We describe a scalable method to automatically generate diverse audio for image captioning datasets. This supports pretraining deep networks for encoding both audio and images, which we do via a dual encoder that learns to align latent representations from both modalities. We show that a masked margin softmax loss for such models is superior to the standard triplet loss. We fine-tune these models on the Flickr8k Audio Captions Corpus and obtain state-of-the-art results{---}improving recall in the top 10 from 29.6{\%} to 49.5{\%}. We also obtain human ratings on retrieval outputs to better assess the impact of incidentally matching image-caption pairs that were not associated in the data, finding that automatic evaluation substantially underestimates the quality of the retrieved results.},
}
Markdown (Informal)
[Large-Scale Representation Learning from Visually Grounded Untranscribed Speech](https://aclanthology.org/K19-1006/) (Ilharco et al., CoNLL 2019)
ACL