@inproceedings{sampat-etal-2020-visuo,
title = "Visuo-Linguistic Question Answering ({VLQA}) Challenge",
author = "Sampat, Shailaja Keyur and
Yang, Yezhou and
Baral, Chitta",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2020.findings-emnlp.413/",
doi = "10.18653/v1/2020.findings-emnlp.413",
pages = "4606--4616",
abstract = "Understanding images and text together is an important aspect of cognition and building advanced Artificial Intelligence (AI) systems. As a community, we have achieved good benchmarks over language and vision domains separately, however joint reasoning is still a challenge for state-of-the-art computer vision and natural language processing (NLP) systems. We propose a novel task to derive joint inference about a given image-text modality and compile the Visuo-Linguistic Question Answering (VLQA) challenge corpus in a question answering setting. Each dataset item consists of an image and a reading passage, where questions are designed to combine both visual and textual information i.e., ignoring either modality would make the question unanswerable. We first explore the best existing vision-language architectures to solve VLQA subsets and show that they are unable to reason well. We then develop a modular method with slightly better baseline performance, but it is still far behind human performance. We believe that VLQA will be a good benchmark for reasoning over a visuo-linguistic context. The dataset, code and leaderboard is available at \url{https://shailaja183.github.io/vlqa/}."
}
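
The abstract describes each VLQA item as an image paired with a reading passage and a question that becomes unanswerable if either modality is ignored. As a rough illustration only, the sketch below shows one plausible way to represent such an item in Python; the field names and the multiple-choice layout are assumptions for illustration, not the dataset's documented schema (see https://shailaja183.github.io/vlqa/ for the actual format).

```python
# Hypothetical sketch of a single VLQA item, based only on the abstract's
# description (image + reading passage + question requiring both modalities).
# Field names and the answer-option layout are assumptions, not the dataset's
# documented schema.
from dataclasses import dataclass, field
from typing import List


@dataclass
class VLQAItem:
    image_path: str                     # path or URL to the associated image
    passage: str                        # accompanying reading passage (text modality)
    question: str                       # question that needs both image and passage
    answer_options: List[str] = field(default_factory=list)  # assumed multiple-choice options
    answer: str = ""                    # gold answer


# Example usage with made-up content:
item = VLQAItem(
    image_path="example_chart.png",
    passage="The chart reports annual rainfall for three cities...",
    question="Which city named in the passage shows the highest bar?",
    answer_options=["City A", "City B", "City C"],
    answer="City B",
)
print(item.question)
```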