@inproceedings{zhou-etal-2025-human,
title = "On the Human-level Performance of Visual Question Answering",
author = "Zhou, Chenlian and
Chen, Guanyi and
Bai, Xin and
Dong, Ming",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2025.coling-main.277/",
pages = "4109--4113",
abstract = "Visual7W has been widely used in assessing multiple-choice visual question-answering (VQA) systems. This paper reports on a replicated human experiment on Visual7W with the aim of understanding the human-level performance of VQA. The replication was not entirely successful because human participants performed significantly worse when answering {\textquotedblleft}where{\textquotedblright}, {\textquotedblleft}when{\textquotedblright}, and {\textquotedblleft}how{\textquotedblright} questions in compared to other question types. An error analysis discovered that the failure was a consequence of the non-deterministic distractors in Visual7W. GPT-4V was then evaluated using and was compared to the human-level performance. The results embody that, when evaluating models' capacity on Visual7W, the performance is not necessarily the higher, the better."
}