@inproceedings{goel-etal-2023-semi,
title = "Semi-supervised multimodal coreference resolution in image narrations",
author = "Goel, Arushi and
Fernando, Basura and
Keller, Frank and
Bilen, Hakan",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.emnlp-main.682/",
doi = "10.18653/v1/2023.emnlp-main.682",
pages = "11067--11081",
abstract = "In this paper, we study multimodal coreference resolution, specifically where a longer descriptive text, i.e., a narration is paired with an image. This poses significant challenges due to fine-grained image-text alignment, inherent ambiguity present in narrative language, and unavailability of large annotated training sets. To tackle these challenges, we present a data efficient semi-supervised approach that utilizes image-narration pairs to resolve coreferences and narrative grounding in a multimodal context. Our approach incorporates losses for both labeled and unlabeled data within a cross-modal framework. Our evaluation shows that the proposed approach outperforms strong baselines both quantitatively and qualitatively, for the tasks of coreference resolution and narrative grounding."
}