% Conference paper in the COLING 2025 main proceedings
% (ACL Anthology ID 2025.coling-main.82).
% Editor names use the "Last, First" form; the compound surname "Di Eugenio"
% is kept together before the comma so it sorts and abbreviates correctly.
% The url field holds the canonical Anthology permalink rather than a
% temporary preview-branch address.
@inproceedings{zhao-etal-2025-graph,
    title = "A Graph Interaction Framework on Relevance for Multimodal Named Entity Recognition with Multiple Images",
    author = "Zhao, Jiachen and
      Huang, Shizhou and
      Lin, Xin",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.82/",
    pages = "1237--1246",
    abstract = "Posts containing multiple images have significant research potential in Multimodal Named Entity Recognition nowadays. The previous methods determine whether the images are related to named entities in the text through similarity computation, such as using CLIP. However, it is not effective in some cases and not conducive to task transfer, especially in multi-image scenarios. To address the issue, we propose a graph interaction framework on relevance (GIFR) for Multimodal Named Entity Recognition with multiple images. For humans, they have the abilities to distinguish whether an image is relevant to named entities, but human capabilities are difficult to model. Therefore, we propose using reinforcement learning based on human preference to integrate human abilities into the model to determine whether an image-text pair is relevant, which is referred to as relevance. To better leverage relevance, we construct a heterogeneous graph and introduce graph transformer to enable information interaction. Experiments on benchmark datasets demonstrate that our method achieves the state-of-the-art performance."
}
Markdown (Informal)
[A Graph Interaction Framework on Relevance for Multimodal Named Entity Recognition with Multiple Images](https://aclanthology.org/2025.coling-main.82/) (Zhao et al., COLING 2025)
ACL