@comment{
  CoNLL 2026 conference paper (Gregorio, Ponti, Goldwater).
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  build, which looks like a staging address -- confirm the canonical
  Anthology URL and DOI once the volume is published, then update url/doi.
  The address field holds the location string as exported; it appears to be
  the conference venue rather than the publisher's city -- verify against the
  target style before relying on it.
}
@inproceedings{gregorio-etal-2026-measuring,
  title     = {Measuring the Effects of Visual Salience in Human and {AI} Descriptions with Image Editing},
  author    = {Gregorio, Nina and
               Ponti, Edoardo and
               Goldwater, Sharon},
  editor    = {Bonial, Claire and
               Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.18/},
  pages     = {301--317},
  isbn      = {979-8-89176-410-1},
  abstract  = {How does our perception of the world influence the way we talk about it? Psycholinguistic studies have investigated whether visual salience correlates with entity mention and ordering, but often disregarded its effect on grammar or relied on simplistic images or artificial cues. In this study, we explore the use of generative AI to better control for salience in visual stimuli while keeping them realistic, and to serve as a proxy for human participants in studying how different types of salience impact image descriptions. We consider three salience types: *perceptual* (e.g. relative size in the image), *inherent* (e.g. animacy), and *relational* (e.g. human{--}object interaction). We first analyze human- and AI-generated captions for natural images to examine how salience correlates with how early, and in what grammatical role, an entity is mentioned. We find strong correlations between models and humans in this observational study, justifying the use of AI models alone in a further causal study. For this second study, we created datasets composed of pairs of images, where we used an image-editing model to intervene on the salience of a target entity. We show that relational and perceptual salience lead to the entity being mentioned earlier in captions and being mapped to more prominent grammatical roles. The magnitude of this effect varies across entity types, with animate entities (high inherent salience) showing a particularly distinct pattern.},
}
Markdown (Informal)
[Measuring the Effects of Visual Salience in Human and AI Descriptions with Image Editing](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.18/) (Gregorio et al., CoNLL 2026)
ACL