@inproceedings{shoer-kementchedjhieva-2025-simple,
title = "A Simple Data Augmentation Strategy for Text-in-Image Scientific {VQA}",
author = "Shoer, Belal and
Kementchedjhieva, Yova",
editor = "Zhang, Chen and
Allaway, Emily and
Shen, Hua and
Miculicich, Lesly and
Li, Yinqiao and
M'hamdi, Meryem and
Limkonchotiwat, Peerat and
Bai, Richard He and
T.y.s.s., Santosh and
Han, Sophia Simeng and
Thapa, Surendrabikram and
Rim, Wiem Ben",
booktitle = "Proceedings of the 9th Widening NLP Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.winlp-main.17/",
pages = "100--105",
ISBN = "979-8-89176-351-7",
abstract = "Scientific visual question answering poses significant challenges for vision-language models due to the complexity of scientific figures and their multimodal context. Traditional approaches treat the figure and accompanying text (e.g., questions and answer options) as separate inputs. EXAMS-V introduced a new paradigm by embedding both visual and textual content into a single image. However, even state-of-the-art proprietary models perform poorly on this setup in zero-shot settings, underscoring the need for task-specific fine-tuning. To address the scarcity of training data in this ``text-in-image'' format, we synthesize a new dataset by converting existing separate image-text pairs into unified images. Fine-tuning a small multilingual multimodal model on a mix of our synthetic data and EXAMS-V yields notable gains across 13 languages, demonstrating strong average improvements and cross-lingual transfer."
}