@inproceedings{reshetnikov-marinescu-2025-caption,
title = "Caption Generation in Cultural Heritage: Crowdsourced Data and Tuning Multimodal Large Language Models",
author = "Reshetnikov, Artem and
Marinescu, Maria-Cristina",
editor = "Nguyen, Duc",
booktitle = "Proceedings of the 1st Workshop on Language Models for Underserved Communities (LM4UC 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.lm4uc-1.6/",
pages = "42--50",
ISBN = "979-8-89176-242-8",
abstract = "Automated caption generation for paintings enables enhanced access and understanding of visual artworks. This work introduces a novel caption dataset, obtained by manual annotation of about 7500 images from the publicly available DEArt dataset for object detection and pose estimation. Our focus is on describing the visual scenes rather than the context or style of the artwork - more common in other existing captioning datasets. The dataset is the result of a crowdsourcing initiative spanning 13 months, with volunteers adhering to explicit captioning guidelines reflecting our requirements. We provide each artwork in the dataset with five captions, created independently by volunteers to ensure diversity of interpretation and increase the robustness of the captioning model. In addition, we explore using the crowdsourced dataset for fine-tuning Large Language Models with vision encoders for domain-specific caption generation. The goal is to improve the performance of multimodal LLMs in the context of cultural heritage, a domain with {\textquotedblleft}small data{\textquotedblright} which often struggles with the nuanced visual analysis and interpretation required for cultural objects such as paintings. The use of crowdsourced data in the domain adaptation process enables us to incorporate the collective perceptual insights of diverse annotators, resulting in an exploration of visual narratives and observing a reduction in hallucinations otherwise produced by these large language models."
}
Markdown (Informal)
[Caption Generation in Cultural Heritage: Crowdsourced Data and Tuning Multimodal Large Language Models](https://aclanthology.org/2025.lm4uc-1.6/) (Reshetnikov & Marinescu, LM4UC 2025)
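
The abstract describes fine-tuning multimodal LLMs (LLMs paired with vision encoders) on the crowdsourced image-caption pairs for domain adaptation to paintings. The citation above does not include a training recipe, so the following is only a minimal, hypothetical sketch of that kind of adaptation; it assumes a BLIP-style captioning model from Hugging Face transformers and a simple list of image-path/caption records, and the model name, data layout, and hyperparameters are illustrative assumptions rather than the authors' actual setup.

```python
# Hypothetical sketch only: not the paper's published code. Model choice,
# data layout, and hyperparameters are illustrative assumptions.
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

class PaintingCaptions(Dataset):
    """One training example per (artwork image, crowdsourced caption) pair."""
    def __init__(self, records, processor):
        self.records = records        # e.g. [{"image_path": "...", "caption": "..."}]
        self.processor = processor

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        rec = self.records[idx]
        image = Image.open(rec["image_path"]).convert("RGB")
        enc = self.processor(images=image, text=rec["caption"],
                             return_tensors="pt", padding="max_length",
                             truncation=True, max_length=64)
        return {k: v.squeeze(0) for k, v in enc.items()}

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

records = []  # fill with the crowdsourced records (five captions per artwork)
loader = DataLoader(PaintingCaptions(records, processor), batch_size=8, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    for batch in loader:
        # BLIP returns a language-modeling loss when labels are supplied.
        out = model(pixel_values=batch["pixel_values"],
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["input_ids"])
        out.loss.backward()
        optimizer.step()
        optimizer.zero_grad()
```

The paper's claim is that this kind of adaptation on crowdsourced captions reduces hallucinations relative to an off-the-shelf multimodal LLM; any real replication should follow the models and settings reported in the paper itself.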