@inproceedings{yan-etal-2024-multi,
title = "Multi-modal Concept Alignment Pre-training for Generative Medical Visual Question Answering",
author = "Yan, Quan and
Duan, Junwen and
Wang, Jianxin",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.findings-acl.319/",
doi = "10.18653/v1/2024.findings-acl.319",
pages = "5378--5389",
abstract = "Medical Visual Question Answering (Med-VQA) seeks to accurately respond to queries regarding medical images, a task particularly challenging for open-ended questions. This study unveils the Multi-modal Concept Alignment Pre-training (MMCAP) approach for generative Med-VQA, leveraging a knowledge graph sourced from medical image-caption datasets and the Unified Medical Language System. MMCAP advances the fusion of visual and textual medical knowledge via a graph attention network and a transformer decoder. Additionally, it incorporates a Type Conditional Prompt in the fine-tuning phase, markedly boosting the accuracy and relevance of answers to open-ended questions. Our tests on benchmark datasets illustrate MMCAP`s superiority over existing methods, demonstrating its high efficiency in data-limited settings and effective knowledge-image alignment capability."
}