@inproceedings{liang-etal-2021-graghvqa,
title = "{G}ragh{VQA}: Language-Guided Graph Neural Networks for Graph-based Visual Question Answering",
author = "Liang, Weixin and
Jiang, Yanhao and
Liu, Zixuan",
editor = "Zadeh, Amir and
Morency, Louis-Philippe and
Liang, Paul Pu and
Ross, Candace and
Salakhutdinov, Ruslan and
Poria, Soujanya and
Cambria, Erik and
Shi, Kelly",
booktitle = "Proceedings of the Third Workshop on Multimodal Artificial Intelligence",
month = jun,
year = "2021",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.maiworkshop-1.12/",
doi = "10.18653/v1/2021.maiworkshop-1.12",
pages = "79--86",
abstract = "Images are more than a collection of objects or attributes {---} they represent a web of relationships among interconnected objects. Scene Graph has emerged as a new modality as a structured graphical representation of images. Scene Graph encodes objects as nodes connected via pairwise relations as edges. To support question answering on scene graphs, we propose GraphVQA, a language-guided graph neural network framework that translates and executes a natural language question as multiple iterations of message passing among graph nodes. We explore the design space of GraphVQA framework, and discuss the trade-off of different design choices. Our experiments on GQA dataset show that GraphVQA outperforms the state-of-the-art accuracy by a large margin (88.43{\%} vs. 94.78{\%})."
}
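For readers who want a concrete picture of "executing a question as iterations of message passing," here is a minimal, hypothetical PyTorch sketch of language-guided message passing on a scene graph. All names (`LanguageGuidedMessagePassing`, `message_fn`, `question_gate`) and the toy dimensions are illustrative assumptions, not the authors' implementation; the paper explores several instantiations of this design space.

```python
# Hypothetical sketch only -- not the GraphVQA codebase.
import torch
import torch.nn as nn

class LanguageGuidedMessagePassing(nn.Module):
    """One round of message passing where messages are gated by the question."""
    def __init__(self, node_dim: int, question_dim: int):
        super().__init__()
        self.message_fn = nn.Linear(node_dim, node_dim)        # message from sender node
        self.question_gate = nn.Linear(question_dim, node_dim) # question -> per-channel gate
        self.update_fn = nn.GRUCell(node_dim, node_dim)        # recurrent node update

    def forward(self, node_feats, edge_index, question_emb):
        # node_feats: (num_nodes, node_dim); edge_index: (2, num_edges)
        src, dst = edge_index
        gate = torch.sigmoid(self.question_gate(question_emb))  # (node_dim,)
        messages = self.message_fn(node_feats[src]) * gate       # language modulates messages
        agg = torch.zeros_like(node_feats)
        agg.index_add_(0, dst, messages)                         # sum incoming messages
        return self.update_fn(agg, node_feats)                   # GRU-style state update

if __name__ == "__main__":
    num_nodes, node_dim, question_dim, T = 5, 32, 64, 3
    layer = LanguageGuidedMessagePassing(node_dim, question_dim)
    x = torch.randn(num_nodes, node_dim)
    edge_index = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 4]])  # toy scene-graph edges
    q = torch.randn(question_dim)
    for _ in range(T):  # "execute" the question as T rounds of guided message passing
        x = layer(x, edge_index, q)
    print(x.shape)  # torch.Size([5, 32])
```

The gating here is one simple way to let the question steer which relations propagate information; per-iteration question conditioning (e.g., attending to different question tokens each round) is another point in the same design space.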