@comment{EMNLP-IJCNLP 2019 conference paper, ACL Anthology ID D19-1518. The url field holds the permanent Anthology address (not a preview build); the doi is stored bare, without a resolver prefix.}
@inproceedings{lu-etal-2019-debug,
  author    = {Lu, Chujie and
               Chen, Long and
               Tan, Chilie and
               Li, Xiaolin and
               Xiao, Jun},
  title     = {{DEBUG}: A Dense Bottom-Up Grounding Approach for Natural Language Video Localization},
  editor    = {Inui, Kentaro and
               Jiang, Jing and
               Ng, Vincent and
               Wan, Xiaojun},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing ({EMNLP-IJCNLP})},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  pages     = {5144--5153},
  doi       = {10.18653/v1/D19-1518},
  url       = {https://aclanthology.org/D19-1518/},
  abstract  = {In this paper, we focus on natural language video localization: localizing (i.e., grounding) a natural language description in a long and untrimmed video sequence. All currently published models for addressing this problem can be categorized into two types: (i) top-down approach: it does classification and regression for a set of pre-cut video segment candidates; (ii) bottom-up approach: it directly predicts probabilities for each video frame as the temporal boundaries (i.e., start and end time point). However, both two approaches suffer several limitations: the former is computation-intensive for densely placed candidates, while the latter has trailed the performance of the top-down counterpart thus far. To this end, we propose a novel dense bottom-up framework: DEnse Bottom-Up Grounding (DEBUG). DEBUG regards all frames falling in the ground truth segment as foreground, and each foreground frame regresses the unique distances from its location to bi-directional ground truth boundaries. Extensive experiments on three challenging benchmarks (TACoS, Charades-STA, and ActivityNet Captions) show that DEBUG is able to match the speed of bottom-up models while surpassing the performance of the state-of-the-art top-down models.},
}
Markdown (Informal)
[DEBUG: A Dense Bottom-Up Grounding Approach for Natural Language Video Localization](https://aclanthology.org/D19-1518/) (Lu et al., EMNLP-IJCNLP 2019)
ACL