@inproceedings{garg-etal-2019-learning,
title = "Learning to Relate from Captions and Bounding Boxes",
author = "Garg, Sarthak and
Moniz, Joel Ruben Antony and
Aviral, Anshu and
Bollimpalli, Priyatham",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1660",
doi = "10.18653/v1/P19-1660",
pages = "6597--6603",
abstract = "In this work, we propose a novel approach that predicts the relationships between various entities in an image in a weakly supervised manner by relying on image captions and object bounding box annotations as the sole source of supervision. Our proposed approach uses a top-down attention mechanism to align entities in captions to objects in the image, and then leverage the syntactic structure of the captions to align the relations. We use these alignments to train a relation classification network, thereby obtaining both grounded captions and dense relationships. We demonstrate the effectiveness of our model on the Visual Genome dataset by achieving a recall@50 of 15{\%} and recall@100 of 25{\%} on the relationships present in the image. We also show that the model successfully predicts relations that are not present in the corresponding captions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="garg-etal-2019-learning">
<titleInfo>
<title>Learning to Relate from Captions and Bounding Boxes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarthak</namePart>
<namePart type="family">Garg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="given">Ruben</namePart>
<namePart type="given">Antony</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anshu</namePart>
<namePart type="family">Aviral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Priyatham</namePart>
<namePart type="family">Bollimpalli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we propose a novel approach that predicts the relationships between various entities in an image in a weakly supervised manner by relying on image captions and object bounding box annotations as the sole source of supervision. Our proposed approach uses a top-down attention mechanism to align entities in captions to objects in the image, and then leverage the syntactic structure of the captions to align the relations. We use these alignments to train a relation classification network, thereby obtaining both grounded captions and dense relationships. We demonstrate the effectiveness of our model on the Visual Genome dataset by achieving a recall@50 of 15% and recall@100 of 25% on the relationships present in the image. We also show that the model successfully predicts relations that are not present in the corresponding captions.</abstract>
<identifier type="citekey">garg-etal-2019-learning</identifier>
<identifier type="doi">10.18653/v1/P19-1660</identifier>
<location>
<url>https://aclanthology.org/P19-1660</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>6597</start>
<end>6603</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning to Relate from Captions and Bounding Boxes
%A Garg, Sarthak
%A Moniz, Joel Ruben Antony
%A Aviral, Anshu
%A Bollimpalli, Priyatham
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 jul
%I Association for Computational Linguistics
%C Florence, Italy
%F garg-etal-2019-learning
%X In this work, we propose a novel approach that predicts the relationships between various entities in an image in a weakly supervised manner by relying on image captions and object bounding box annotations as the sole source of supervision. Our proposed approach uses a top-down attention mechanism to align entities in captions to objects in the image, and then leverage the syntactic structure of the captions to align the relations. We use these alignments to train a relation classification network, thereby obtaining both grounded captions and dense relationships. We demonstrate the effectiveness of our model on the Visual Genome dataset by achieving a recall@50 of 15% and recall@100 of 25% on the relationships present in the image. We also show that the model successfully predicts relations that are not present in the corresponding captions.
%R 10.18653/v1/P19-1660
%U https://aclanthology.org/P19-1660
%U https://doi.org/10.18653/v1/P19-1660
%P 6597-6603
Markdown (Informal)
[Learning to Relate from Captions and Bounding Boxes](https://aclanthology.org/P19-1660) (Garg et al., ACL 2019)
ACL
- Sarthak Garg, Joel Ruben Antony Moniz, Anshu Aviral, and Priyatham Bollimpalli. 2019. Learning to Relate from Captions and Bounding Boxes. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 6597–6603, Florence, Italy. Association for Computational Linguistics.