@inproceedings{ogezi-shi-2025-spare,
title = "{S}pa{RE}: Enhancing Spatial Reasoning in Vision-Language Models with Synthetic Data",
author = "Ogezi, Michael and
Shi, Freda",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.387/",
pages = "7855--7875",
ISBN = "979-8-89176-251-0",
abstract = "Vision-language models (VLMs) work well in tasks ranging from image captioning to visual question answering (VQA), yet they struggle with spatial reasoning, a key skill for understanding our physical world that humans excel at. We find that spatial relations are generally rare in widely used VL datasets, with only a few being well represented, while most form a long tail of underrepresented relations. This gap leaves VLMs ill-equipped to handle diverse spatial relationships. To bridge it, we construct a synthetic VQA dataset focused on spatial reasoning generated from hyper-detailed image descriptions in Localized Narratives, DOCCI, and PixMo-Cap. Our dataset consists of 455k samples containing 3.4 million QA pairs. Trained on this dataset, our Spatial-Reasoning Enhanced (SpaRE) VLMs show strong improvements on spatial reasoning benchmarks, achieving up to a 49{\%} performance gain on the What{'}s Up benchmark, while maintaining strong results on general tasks. Our work narrows the gap between human and VLM spatial reasoning and makes VLMs more capable in real-world tasks such as robotics and navigation. We plan to share our code and dataset in due course."
}