@comment{ACL Anthology record 2025.findings-acl.876 (Findings of ACL 2025).
The url field uses the canonical aclanthology.org address instead of the
ingestion-preview host the record was originally exported from.}
@inproceedings{li-etal-2025-change,
    title = "Change Entity-guided Heterogeneous Representation Disentangling for Change Captioning",
    author = "Li, Yi and
      Tu, Yunbin and
      Li, Liang and
      Su, Li and
      Huang, Qingming",
    editor = "Che, Wanxiang and
      Nabende, Joyce and
      Shutova, Ekaterina and
      Pilehvar, Mohammad Taher",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-acl.876/",
    pages = "17050--17060",
    isbn = "979-8-89176-256-5",
    abstract = "Change captioning aims to describe differences between a pair of images using natural language. However, learning effective difference representations is highly challenging due to distractors such as illumination and viewpoint changes. To address this, we propose a change-entity-guided disentanglement network that explicitly learns difference representations while mitigating the impact of distractors. Specifically, we first design a change entity retrieval module to identify key objects involved in the change from a textual perspective. Then, we introduce a difference representation enhancement module that strengthens the learned features, disentangling genuine differences from background variations. To further refine the generation process, we incorporate a gated Transformer decoder, which dynamically integrates both visual difference and textual change-entity information. Extensive experiments on CLEVR-Change, CLEVR-DC and Spot-the-Diff datasets demonstrate that our method outperforms existing approaches, achieving state-of-the-art performance. The code is available at https://github.com/yili-19/CHEER"
}
Markdown (Informal)
[Change Entity-guided Heterogeneous Representation Disentangling for Change Captioning](https://aclanthology.org/2025.findings-acl.876/) (Li et al., Findings 2025)
ACL