@inproceedings{dong-etal-2025-bridging,
title = "Bridging Language and Scenes through Explicit 3-{D} Model Construction",
author = "Dong, Tiansi and
Das, Writwick and
Sifa, Rafet",
editor = "Liu, Kang and
Song, Yangqiu and
Han, Zhen and
Sifa, Rafet and
He, Shizhu and
Long, Yunfei",
booktitle = "Proceedings of Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning @ COLING 2025",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.neusymbridge-1.6/",
pages = "51--60",
abstract = "We introduce the methodology of explicit model construction to bridge linguistic descriptions and scene perception and demonstrate that in Visual Question-Answering (VQA) using MC4VQA (Model Construction for Visual Question-Answering), a method developed by us. Given a question about a scene, our MC4VQA first recognizes objects utilizing pre-trained deep learning systems. Then, it constructs an explicit 3-D layout by repeatedly reducing the difference between the input scene image and the image rendered from the current 3-D spatial environment. This novel ``iterative rendering'' process endows MC4VQA the capability of acquiring spatial attributes without training data. MC4VQA outperforms NS-VQA (the SOTA system) by reaching 99.94{\%} accuracy on the benchmark CLEVR datasets, and is more robust than NS-VQA on new testing datasets. With newly created testing data, NS-VQA{'}s performance dropped to 97.60{\%}, while MC4VQA still kept the 99.0{\%} accuracy. This work sets a new SOTA performance of VQA on the benchmark CLEVR datasets, and shapes a new method that may solve the out-of-distribution problem."
}
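The abstract's ``iterative rendering'' step (propose a 3-D layout, render it, compare the rendering with the observed image, adjust the layout to shrink the difference) can be illustrated with a minimal sketch. The toy 2-D renderer, the hill-climbing search, and all names below (render, pixel_loss, fit_layout) are illustrative assumptions, not the authors' implementation; per the abstract, the initial layout would in practice come from objects recognized by pre-trained deep learning systems, so the starting guess is assumed to be roughly correct.

```python
# Hedged sketch of an iterative-rendering (render-and-compare) loop.
# Everything here is a toy stand-in for the paper's 3-D pipeline.
import numpy as np

H, W = 32, 32

def render(positions):
    """Render a toy image: one bright 5x5 square per object position."""
    img = np.zeros((H, W))
    for r, c in positions:
        r, c = int(round(r)), int(round(c))
        img[max(r - 2, 0):r + 3, max(c - 2, 0):c + 3] = 1.0
    return img

def pixel_loss(rendered, observed):
    """Difference between the rendered image and the input scene image."""
    return float(np.abs(rendered - observed).sum())

def fit_layout(observed, positions, iters=200, step=1.0):
    """Greedily nudge object positions to reduce the render-vs-observation gap."""
    best = pixel_loss(render(positions), observed)
    for _ in range(iters):
        improved = False
        for i in range(len(positions)):
            for dr, dc in [(-step, 0), (step, 0), (0, -step), (0, step)]:
                cand = [p[:] for p in positions]
                cand[i][0] += dr
                cand[i][1] += dc
                loss = pixel_loss(render(cand), observed)
                if loss < best:
                    best, positions, improved = loss, cand, True
        if not improved:
            break
    return positions, best

if __name__ == "__main__":
    observed = render([[8.0, 8.0], [20.0, 24.0]])      # the "input scene"
    guess = [[10.0, 10.0], [18.0, 22.0]]                # rough initial layout
    fitted, loss = fit_layout(observed, guess)
    print("recovered layout:", fitted, "final loss:", loss)
```

In this sketch the layout is refined by local search over pixel differences; the actual system would optimize richer 3-D spatial attributes against a proper renderer, but the loop structure (render, compare, update) is the same idea the abstract describes.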