@inproceedings{shi-etal-2024-find,
title = "Find-the-Common: A Benchmark for Explaining Visual Patterns from Images",
author = "Shi, Yuting and
Inoue, Naoya and
Wei, Houjing and
Zhao, Yufeng and
Jin, Tao",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.642/",
pages = "7307--7313",
abstract = "Recent advances in Instruction-fine-tuned Vision and Language Models (IVLMs), such as GPT-4V and InstructBLIP, have prompted some studies have started an in-depth analysis of the reasoning capabilities of IVLMs. However, Inductive Visual Reasoning, a vital skill for text-image understanding, remains underexplored due to the absence of benchmarks. In this paper, we introduce Find-the-Common (FTC): a new vision and language task for Inductive Visual Reasoning. In this task, models are required to identify an answer that explains the common attributes across visual scenes. We create a new dataset for the FTC and assess the performance of several contemporary approaches including Image-Based Reasoning, Text-Based Reasoning, and Image-Text-Based Reasoning with various models. Extensive experiments show that even state-of-the-art models like GPT-4V can only archive with 48{\%} accuracy on the FTC, for which, the FTC is a new challenge for the visual reasoning research community. Our dataset has been released and is available online: https://github.com/SSSSSeki/Find-the-common."
}