% Han et al. (2024), Findings of NAACL 2024 (ACL Anthology ID 2024.findings-naacl.162).
% `url` is the stable ACL Anthology permalink; `address` holds the conference
% venue, following the ACL Anthology export convention.
@inproceedings{han-etal-2024-self,
  title     = "Self-Adaptive Sampling for Accurate Video Question Answering on Image Text Models",
  author    = "Han, Wei and
               Chen, Hui and
               Kan, Min-Yen and
               Poria, Soujanya",
  editor    = "Duh, Kevin and
               Gomez, Helena and
               Bethard, Steven",
  booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
  month     = jun,
  year      = "2024",
  address   = "Mexico City, Mexico",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2024.findings-naacl.162/",
  doi       = "10.18653/v1/2024.findings-naacl.162",
  pages     = "2522--2534",
  abstract  = "Image{--}text models (ITMs) is the prevalent architecture to solve video question{--}answering tasks, which requires only a few input frames to save huge computational cost compared to video{--}language models. However, we find existent ITM video question{--}answering solutions either 1) adopt simplistic and unintentional sampling strategies, which may miss key frames to offer the answer clues; or 2) sample a large number of frames into divided groups, which the computational sources can not accommodate. In this work, we aim at an efficient sampling method towards the few-frame situations. We first summarize a family of prior sampling methods based on question{--}frame correlation into a unified one, dubbed *Most Implied Frames* (MIF). Through some primary results and analysis, we form a hypothesis that question-aware sampling is not necessary, from which we further propose the other method *Most Dominant Frames* (MDF). Experimental results on four public datasets and three advanced ITMs demonstrate that our proposed strategies can boost the performance for image{--}text pretrained models, and have a wide application scenario in terms of model architectures and dataset types. Our code is available at https://github.com/declare-lab/Sealing."
}
Markdown (Informal)
[Self-Adaptive Sampling for Accurate Video Question Answering on Image Text Models](https://aclanthology.org/2024.findings-naacl.162/) (Han et al., Findings 2024)
ACL