% EACL 2024 long paper (Wang, Han, Poon). The url field holds the canonical
% ACL Anthology address rather than a temporary preview-branch mirror.
@inproceedings{wang-etal-2024-sco,
  title     = {{SCO}-{VIST}: Social Interaction Commonsense Knowledge-based Visual Storytelling},
  author    = {Wang, Eileen and
               Han, Caren and
               Poon, Josiah},
  editor    = {Graham, Yvette and
               Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2024},
  address   = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.eacl-long.96/},
  pages     = {1602--1616},
  abstract  = {Visual storytelling aims to automatically generate a coherent story based on a given image sequence. Unlike tasks like image captioning, visual stories should contain factual descriptions, worldviews, and human social commonsense to put disjointed elements together to form a coherent and engaging human-writeable story. However, most models mainly focus on applying factual information and using taxonomic/lexical external knowledge when attempting to create stories. This paper introduces SCO-VIST, a framework representing the image sequence as a graph with objects and relations that includes human action motivation and its social interaction commonsense knowledge. SCO-VIST then takes this graph representing plot points and creates bridges between plot points with semantic and occurrence-based edge weights. This weighted story graph produces the storyline in a sequence of events using Floyd-Warshall{'}s algorithm. Our proposed framework produces stories superior across multiple metrics in terms of visual grounding, coherence, diversity, and humanness, per both automatic and human evaluations.},
}
Markdown (Informal)
[SCO-VIST: Social Interaction Commonsense Knowledge-based Visual Storytelling](https://aclanthology.org/2024.eacl-long.96/) (Wang et al., EACL 2024)
ACL