@inproceedings{diem-etal-2023-university,
    % Delimiters normalised to braces throughout (original mixed "..." and {...});
    % url normalised from a temporary preview deployment to the canonical,
    % stable ACL Anthology address matching the DOI (2023.semeval-1.18).
    title     = {{U}niversity of {H}ildesheim at {S}em{E}val-2023 Task 1: Combining Pre-trained Multimodal and Generative Models for Image Disambiguation},
    author    = {Diem, Sebastian and
      Im, Chan Jong and
      Mandl, Thomas},
    editor    = {Ojha, Atul Kr. and
      Do{\u{g}}ru{\"o}z, A. Seza and
      Da San Martino, Giovanni and
      Tayyar Madabushi, Harish and
      Kumar, Ritesh and
      Sartori, Elisa},
    booktitle = {Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023)},
    month     = jul,
    year      = {2023},
    address   = {Toronto, Canada},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2023.semeval-1.18/},
    doi       = {10.18653/v1/2023.semeval-1.18},
    pages     = {130--135},
    % Abstract reproduced verbatim from the Anthology export (source data; not edited).
    abstract  = {Multimodal ambiguity is a challenge for understanding text and images. Large pre-trained models have reached a high level of quality already. This paper presents an implementation for solving a image disambiguation task relying solely on the knowledge captured in multimodal and language models. Within the task 1 of SemEval 2023 (Visual Word Sense Disambiguation), this approach managed to achieve an MRR of 0.738 using CLIP-Large and the OPT model for generating text. Applying a generative model to create more text given a phrase with an ambiguous word leads to an improvement of our results. The performance gain from a bigger language model is larger than the performance gain from using the lager CLIP model.},
}
Markdown (Informal)
[University of Hildesheim at SemEval-2023 Task 1: Combining Pre-trained Multimodal and Generative Models for Image Disambiguation](https://aclanthology.org/2023.semeval-1.18/) (Diem et al., SemEval 2023)
ACL