% Workshop paper by Kapuriya, Hatami and Buitelaar, published in the
% proceedings of the Fifth GEM Workshop (July 2026), pages 410--427.
% NOTE(review): the url field targets a "preview ... ingest" host, presumably
% a staging address -- confirm the permanent link before relying on it.
@inproceedings{kapuriya-etal-2026-progressive,
  author    = {Kapuriya, Janak and
               Hatami, Ali and
               Buitelaar, Paul},
  title     = {A Progressive Evaluation Framework for Multicultural Analysis of Story Visualization},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {410--427},
  isbn      = {979-8-89176-423-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.39/},
  abstract  = {Recent advancements in text-to-image generative models have improved narrative consistency in story visualization. However, current story visualization models often overlook cultural dimensions, resulting in visuals that lack cultural fidelity. In this study, we present a progressive evaluation framework for story visualization. We validate this framework on current text-to-image models across three languages (English, Hindi, and Chinese) on two datasets (VIST and FlintstonesSV). The proposed framework introduces three levels of cultural analysis as evaluation rubrics: 1) Basic Cultural Criteria, 2) Cultural Dimension Guidance, and 3) Cultural Examples Grounding. We evaluate story visualization by use of a novel MLLM-as-Jury approach across all three rubrics and a small-scale human evaluation only on the third rubric. We implement an MLLM-as-jury approach by aggregating scores from three different families of MLLM-as-Judge models. In our experiments, real-world stories generally receive higher cultural appropriateness scores than animated ones, with English tending to score higher than Hindi and Chinese across the evaluated models. Some examples also exhibited culturally inconsistent or stereotypical elements noted by annotators. The proposed progressive evaluation framework has therefore been shown to provide early insights into cultural misalignments in story visualization. Code for this work is made available on https://github.com/janak11111/Cultural{\_}Eval{\_}For{\_}StoryViz},
}
Markdown (Informal)
[A Progressive Evaluation Framework for Multicultural Analysis of Story Visualization](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.39/) (Kapuriya et al., GEM 2026)
ACL