@inproceedings{chen-etal-2019-weakly,
title = "Weakly-Supervised Spatio-Temporally Grounding Natural Sentence in Video",
author = "Chen, Zhenfang and
Ma, Lin and
Luo, Wenhan and
Wong, Kwan-Yee Kenneth",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/P19-1183/",
doi = "10.18653/v1/P19-1183",
pages = "1884--1894",
abstract = "In this paper, we address a novel task, namely weakly-supervised spatio-temporally grounding natural sentence in video. Specifically, given a natural sentence and a video, we localize a spatio-temporal tube in the video that semantically corresponds to the given sentence, with no reliance on any spatio-temporal annotations during training. First, a set of spatio-temporal tubes, referred to as instances, are extracted from the video. We then encode these instances and the sentence using our newly proposed attentive interactor which can exploit their fine-grained relationships to characterize their matching behaviors. Besides a ranking loss, a novel diversity loss is introduced to train our attentive interactor to strengthen the matching behaviors of reliable instance-sentence pairs and penalize the unreliable ones. We also contribute a dataset, called VID-sentence, based on the ImageNet video object detection dataset, to serve as a benchmark for our task. Results from extensive experiments demonstrate the superiority of our model over the baseline approaches."
}