% EMNLP-IJCNLP 2019 paper introducing the YouMakeup dataset (ACL Anthology ID D19-1517).
% The url field uses the permanent Anthology address rather than a preview/staging branch link.
@inproceedings{wang-etal-2019-youmakeup,
  title     = {{YouMakeup}: A Large-Scale Domain-Specific Multimodal Dataset for Fine-Grained Semantic Comprehension},
  author    = {Wang, Weiying and
               Wang, Yongcheng and
               Chen, Shizhe and
               Jin, Qin},
  editor    = {Inui, Kentaro and
               Jiang, Jing and
               Ng, Vincent and
               Wan, Xiaojun},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  month     = nov,
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/D19-1517/},
  doi       = {10.18653/v1/D19-1517},
  pages     = {5133--5143},
  abstract  = {Multimodal semantic comprehension has attracted increasing research interests recently such as visual question answering and caption generation. However, due to the data limitation, fine-grained semantic comprehension has not been well investigated, which requires to capture semantic details of multimodal contents. In this work, we introduce {\textquotedblleft}YouMakeup{\textquotedblright}, a large-scale multimodal instructional video dataset to support fine-grained semantic comprehension research in specific domain. YouMakeup contains 2,800 videos from YouTube, spanning more than 420 hours in total. Each video is annotated with a sequence of natural language descriptions for instructional steps, grounded in temporal video range and spatial facial areas. The annotated steps in a video involve subtle difference in actions, products and regions, which requires fine-grained understanding and reasoning both temporally and spatially. In order to evaluate models' ability for fined-grained comprehension, we further propose two groups of tasks including generation tasks and visual question answering from different aspects. We also establish a baseline of step caption generation for future comparison. The dataset will be publicly available at \url{https://github.com/AIM3-RUC/YouMakeup} to support research investigation in fine-grained semantic comprehension.},
}
Markdown (Informal)
[YouMakeup: A Large-Scale Domain-Specific Multimodal Dataset for Fine-Grained Semantic Comprehension](https://aclanthology.org/D19-1517/) (Wang et al., EMNLP-IJCNLP 2019)
ACL