@inproceedings{borisova-etal-2025-scivqa,
title = "{S}ci{VQA} 2025: Overview of the First Scientific Visual Question Answering Shared Task",
author = "Borisova, Ekaterina and
Rauscher, Nikolas and
Rehm, Georg",
editor = "Ghosal, Tirthankar and
Mayr, Philipp and
Singh, Amanpreet and
Naik, Aakanksha and
Rehm, Georg and
Freitag, Dayne and
Li, Dan and
Schimmler, Sonja and
De Waard, Anita",
booktitle = "Proceedings of the Fifth Workshop on Scholarly Document Processing (SDP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.sdp-1.18/",
doi = "10.18653/v1/2025.sdp-1.18",
pages = "182--210",
ISBN = "979-8-89176-265-7",
abstract = "This paper provides an overview of the First Scientific Visual Question Answering (SciVQA) shared task conducted as part of the Fifth Scholarly Document Processing workshop (SDP 2025). SciVQA aims to explore the capabilities of current multimodal large language models (MLLMs) in reasoning over figures from scholarly publications for question answering (QA). The main focus of the challenge is on closed-ended visual and non-visual QA pairs. We developed the novel SciVQA benchmark comprising 3,000 images of figures and a total of 21,000 QA pairs. The shared task received seven submissions, with the best performing system achieving an average F1 score of approx. 0.86 across ROUGE-1, ROUGE-L, and BertScore metrics. Participating teams explored various fine-tuning and prompting strategies, as well as augmenting the SciVQA dataset with out-of-domain data and incorporating relevant context from source publications. The findings indicate that while MLLMs demonstrate strong performance on SciVQA, they face challenges in visual reasoning and still fall behind human judgments."
}