% Conference paper in the RANLP 2025 proceedings (ACL Anthology ID 2025.ranlp-1.69).
% The url field holds the canonical Anthology address rather than a temporary
% preview-branch link. address/publisher follow the ACL Anthology export
% convention (address = conference location).
@inproceedings{kralev-2025-fusion,
  title     = {Fusion of Object-Centric and Linguistic Features for Domain-Adapted Multimodal Learning},
  author    = {Kralev, Jordan Konstantinov},
  editor    = {Angelova, Galia and
               Kunilovskaya, Maria and
               Escribe, Marie and
               Mitkov, Ruslan},
  booktitle = {Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative {AI} Era},
  month     = sep,
  year      = {2025},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2025.ranlp-1.69/},
  pages     = {587--594},
  abstract  = {Modern multimodal systems often struggle to link domain-specific visual content with textual descriptions, especially when object recognition is limited to general categories (e.g. COCO classes) and lacks customised adaptation to language models. In this paper, we present a novel framework that integrates a domain-specific adapted Detectron2 model into predefined models via a trainable projection layer, enabling precise crossmodal adaptation for specialised domains. Our approach extends Detectron2{'}s recognition capabilities to new categories by fine-tuning on multi-domain datasets, while a lightweight linear projection layer maps region-based visual features to the model{'}s embedding space without completely retraining the model. We evaluated the framework for domain-specific image captioning. The presented approach provides a scalable design for combining domain-specific visual recognition with language inference, with applications in domains that require fine-grained multimodal understanding.},
}
Markdown (Informal)
[Fusion of Object-Centric and Linguistic Features for Domain-Adapted Multimodal Learning](https://aclanthology.org/2025.ranlp-1.69/) (Kralev, RANLP 2025)
ACL