@inproceedings{bu-etal-2025-walk,
title = "Walk in Others' Shoes with a Single Glance: Human-Centric Visual Grounding with Top-View Perspective Transformation",
author = "Bu, Yuqi and
Wu, Xin and
Zhao, Zirui and
Cai, Yi and
Hsu, David and
Liu, Qiong",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1306/",
pages = "26904--26923",
ISBN = "979-8-89176-251-0",
    abstract = "Visual perspective-taking, an ability to envision others' perspectives from a single self-perspective, is vital in human-robot interactions. Thus, we introduce a human-centric visual grounding task and a dataset to evaluate this ability. Recent advances in vision-language models (VLMs) have shown potential for inferring others' perspectives, yet VLMs remain insensitive to information differences induced by slight perspective changes. To address this problem, we propose a top-view enhanced perspective transformation (TEP) method, which decomposes the transition from robot to human perspectives through an abstract top-view representation. It unifies perspectives and facilitates the capture of information differences from diverse perspectives. Experimental results show that TEP improves performance by up to 18{\%}, exhibits perspective-taking abilities across various perspectives, and generalizes effectively to robotic and dynamic scenarios."
}