@inproceedings{zhang-wan-2023-exploring,
    title     = {Exploring the Impact of Vision Features in News Image Captioning},
    author    = {Zhang, Junzhe and
                 Wan, Xiaojun},
    editor    = {Rogers, Anna and
                 Boyd-Graber, Jordan and
                 Okazaki, Naoaki},
    booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2023},
    month     = jul,
    year      = {2023},
    address   = {Toronto, Canada},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2023.findings-acl.818/},
    doi       = {10.18653/v1/2023.findings-acl.818},
    pages     = {12923--12936},
    abstract  = {The task of news image captioning aims to generate a detailed caption which describes the specific information of an image in a news article. However, we find that recent state-of-art models can achieve competitive performance even without vision features. To resolve the impact of vision features in the news image captioning task, we conduct extensive experiments with mainstream models based on encoder-decoder framework. From our exploration, we find 1) vision features do contribute to the generation of news image captions; 2) vision features can assist models to better generate entities of captions when the entity information is sufficient in the input textual context of the given article; 3) Regions of specific objects in images contribute to the generation of related entities in captions.},
}
Markdown (Informal)
[Exploring the Impact of Vision Features in News Image Captioning](https://aclanthology.org/2023.findings-acl.818/) (Zhang & Wan, Findings of ACL 2023)
ACL