@inproceedings{yang-etal-2025-storyllava,
    title = "{StoryLLaVA}: Enhancing Visual Storytelling with Multi-Modal Large Language Models",
    author = "Yang, Li and
      Xiao, Zhiding and
      Huang, Wenxin and
      Zhong, Xian",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.266/",
    pages = "3936--3951",
    abstract = "The rapid development of multimodal large language models (MLLMs) has positioned visual storytelling as a crucial area in content creation. However, existing models often struggle to maintain temporal, spatial, and narrative coherence across image sequences, and they frequently lack the depth and engagement of human-authored stories. To address these challenges, we propose Story with Large Language-and-Vision Alignment (StoryLLaVA), a novel framework for enhancing visual storytelling. Our approach introduces a topic-driven narrative optimizer that improves both the training data and MLLM models by integrating image descriptions, topic generation, and GPT-4-based refinements. Furthermore, we employ a preference-based ranked story sampling method that aligns model outputs with human storytelling preferences through positive-negative pairing. These two phases of the framework differ in their training methods: the former uses supervised fine-tuning, while the latter incorporates reinforcement learning with positive and negative sample pairs. Experimental results demonstrate that StoryLLaVA outperforms current models in visual relevance, coherence, and fluency, with LLM-based evaluations confirming the generation of richer and more engaging narratives. The enhanced dataset and model will be made publicly available soon."
}
Markdown (Informal)
[StoryLLaVA: Enhancing Visual Storytelling with Multi-Modal Large Language Models](https://aclanthology.org/2025.coling-main.266/) (Yang et al., COLING 2025)
ACL