@inproceedings{adjali-etal-2024-multi,
title = "Multi-Level Information Retrieval Augmented Generation for Knowledge-based Visual Question Answering",
author = "Adjali, Omar and
Ferret, Olivier and
Ghannay, Sahar and
Le Borgne, Herv{\'e}",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.922/",
doi = "10.18653/v1/2024.emnlp-main.922",
pages = "16499--16513",
abstract = "The Knowledge-Aware Visual Question Answering about Entity task aims to disambiguate entities using textual and visual information, as well as knowledge. It usually relies on two independent steps, information retrieval followed by reading comprehension, that do not benefit from each other. Retrieval Augmented Generation (RAG) offers a solution by using generated answers as feedback for retrieval training. However, RAG usually relies solely on pseudo-relevant passages retrieved from external knowledge bases, which can lead to ineffective answer generation. In this work, we propose a multi-level information RAG approach that enhances answer generation through entity retrieval and query expansion. We formulate a joint-training RAG loss such that answer generation is conditioned on both entity and passage retrievals. We show through experiments new state-of-the-art performance on the ViQuAE KB-VQA benchmark and demonstrate that our approach can help retrieve more actually relevant knowledge to generate accurate answers."
}
Markdown (Informal)
[Multi-Level Information Retrieval Augmented Generation for Knowledge-based Visual Question Answering](https://aclanthology.org/2024.emnlp-main.922/) (Adjali et al., EMNLP 2024)
ACL
Omar Adjali, Olivier Ferret, Sahar Ghannay, and Hervé Le Borgne. 2024. Multi-Level Information Retrieval Augmented Generation for Knowledge-based Visual Question Answering. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 16499–16513, Miami, Florida, USA. Association for Computational Linguistics.