% ISO-Bench workshop paper in the GEM 2026 proceedings (ACL Anthology export).
% NOTE(review): the url field below points at a preview.aclanthology.org ingest
% branch; presumably it should be replaced by the canonical Anthology URL (or a
% DOI) once the volume is published -- confirm before release.
% The benchmark name is brace-protected as a whole token so that
% sentence-casing styles do not render it as "ISO-bench".
@inproceedings{sadana-etal-2026-iso,
  title     = {{ISO-Bench}: Benchmarking Multimodal Causal Reasoning in Visual{--}Language Models through Procedural Plans},
  author    = {Sadana, Ananya and
               Lal, Yash Kumar and
               Zhou, Jiawei},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.68/},
  pages     = {797--807},
  isbn      = {979-8-89176-423-1},
  abstract  = {Understanding causal relationships across modalities is a core challenge for multimodal models operating in real-world environments. We introduce ISO-Bench, a benchmark for evaluating whether models can infer causal dependencies between visual observations and procedural text. Each example presents an image of a task step and a text snippet from a plan, with the goal of deciding whether the visual step occurs before or after the referenced text step. Evaluation results on ten frontier vision-language models show underwhelming performance: the best zero-shot F1 is only 0.57, and chain-of-thought reasoning yields only modest gains (up to 0.62 F1), largely behind humans (0.98 F1). Our analysis further highlights concrete directions for improving causal understanding in multimodal models.},
}
Markdown (Informal)
[ISO-Bench: Benchmarking Multimodal Causal Reasoning in Visual–Language Models through Procedural Plans](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.68/) (Sadana et al., GEM 2026)
ACL