% Li et al., EMNLP 2018 (ACL Anthology ID D18-1164).
% month is a range built from the standard oct/nov macros joined by an en dash.
% url is the canonical Anthology address, not a preview-branch build.
@inproceedings{li-etal-2018-tell,
  title     = {{Tell-and-Answer}: Towards Explainable Visual Question Answering using Attributes and Captions},
  author    = {Li, Qing and
               Fu, Jianlong and
               Yu, Dongfei and
               Mei, Tao and
               Luo, Jiebo},
  editor    = {Riloff, Ellen and
               Chiang, David and
               Hockenmaier, Julia and
               Tsujii, Jun{'}ichi},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  month     = oct # "--" # nov,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/D18-1164/},
  doi       = {10.18653/v1/D18-1164},
  pages     = {1338--1346},
  abstract  = {In Visual Question Answering, most existing approaches adopt the pipeline of representing an image via pre-trained CNNs, and then using the uninterpretable CNN features in conjunction with the question to predict the answer. Although such end-to-end models might report promising performance, they rarely provide any insight, apart from the answer, into the VQA process. In this work, we propose to break up the end-to-end VQA into two steps: explaining and reasoning, in an attempt towards a more explainable VQA by shedding light on the intermediate results between these two steps. To that end, we first extract attributes and generate descriptions as explanations for an image. Next, a reasoning module utilizes these explanations in place of the image to infer an answer. The advantages of such a breakdown include: (1) the attributes and captions can reflect what the system extracts from the image, thus can provide some insights for the predicted answer; (2) these intermediate results can help identify the inabilities of the image understanding or the answer inference part when the predicted answer is wrong. We conduct extensive experiments on a popular VQA dataset and our system achieves comparable performance with the baselines, yet with added benefits of explanability and the inherent ability to further improve with higher quality explanations.},
}
Markdown (Informal)
[Tell-and-Answer: Towards Explainable Visual Question Answering using Attributes and Captions](https://aclanthology.org/D18-1164/) (Li et al., EMNLP 2018)
ACL