@inproceedings{li-etal-2025-sa,
title = "{SA}-{CLIP}: Language Guided Image Spatial and Action Feature Learning",
author = "Li, Guanlin and
Shao, Wenhao and
Rajapaksha, Praboda and
Crespi, Noel",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1134/",
doi = "10.18653/v1/2025.findings-emnlp.1134",
pages = "20808--20814",
ISBN = "979-8-89176-335-7",
abstract = "We observed that Contrastive Language-Image Pretraining (CLIP) models struggle with real-world downstream tasks such as road traffic anomaly detection, due to their inability to effectively capture spatial and action relationships between objects within images. To address this, we compile and curate a dataset with 1M samples of images using language supervision provided by the common image caption dataset, in which each image is paired with subject-relationship-object descriptions emphasizing spatial and action interactions, and train a \textbf{S}patial and \textbf{A}ction relationship aware \textbf{CLIP} (\textbf{SA-CLIP}) model. We evaluated the proposed model on the Visual Spatial Reasoning (VSR) dataset and further verified its effectiveness on the Detection-of-Traffic-Anomaly (DoTA) dataset. Experiment results show that the proposed SA-CLIP demonstrates strong abilities in understanding spatial relationships while achieving good zero-shot performance on the traffic anomaly detection task."
}
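
The abstract describes zero-shot traffic anomaly detection by scoring an image against relation-focused (subject-relationship-object) text prompts with a CLIP-style model. Below is a minimal sketch of that kind of zero-shot prompt scoring, assuming the stock Hugging Face "openai/clip-vit-base-patch32" checkpoint as a stand-in; the SA-CLIP weights are not referenced here, and the prompt wording and file names are illustrative, not the authors' released code.

```python
# Minimal sketch: zero-shot scoring of an image against relation-style prompts
# with a CLIP model (a stand-in for SA-CLIP; checkpoint, prompts, and file
# names below are assumptions for illustration only).
from PIL import Image
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("frame.jpg")  # hypothetical dashcam frame

# Subject-relationship-object style prompts emphasizing spatial/action relations.
prompts = [
    "a car driving in its lane behind another car",               # normal
    "a car colliding with a pedestrian on the road",              # anomalous
    "a vehicle swerving across the road into oncoming traffic",   # anomalous
]

inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    logits_per_image = model(**inputs).logits_per_image  # shape: (1, num_prompts)
probs = logits_per_image.softmax(dim=-1)[0]

for prompt, p in zip(prompts, probs.tolist()):
    print(f"{p:.3f}  {prompt}")
```

In a setup like the one the abstract outlines, the probability mass assigned to anomaly-describing prompts versus normal-driving prompts could serve as a zero-shot anomaly score per frame.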