@inproceedings{zhao-etal-2025-zero,
    title = "Zero-Shot Defense Against Toxic Images via Inherent Multimodal Alignment in {LVLM}s",
    author = "Zhao, Wei and
      Li, Zhe and
      Li, Yige and
      Sun, Jun",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.767/",
    doi = "10.18653/v1/2025.findings-emnlp.767",
    pages = "14232--14246",
    isbn = "979-8-89176-335-7",
    abstract = "Large Vision-Language Models (LVLMs) have made significant strides in multimodal comprehension, thanks to extensive pre-training and fine-tuning on large-scale visual datasets. However, despite their robust textual safety mechanisms, they remain vulnerable to harmful visual inputs. Existing safeguards{---}typically relying on pre-filtering or fine-tuning{---}incur high costs and diminish overall utility. To address this critical vulnerability, we introduce SafeCLIP, a lightweight method that leverages LVLMs' inherent multimodal alignment for zero-shot toxic image detection. By projecting CLIP{'}s discarded CLS token into its text space and matching it with toxic descriptors, SafeCLIP detects harmful content without any architectural changes{---}adding minimal latency and enabling dynamic safety corrections during inference and fine-tuning. Experiments show that SafeCLIP achieves a 66.9{\%} defense success rate with only 3.2{\%} false positive rate and 7.2{\%} overhead. In contrast, state-of-the-art methods achieve 52.9{\%} success but have a 10.7{\%} false positive rate and 210{\%} overhead. Our work demonstrates that leveraging inherent multimodal alignment can yield efficient, low-cost LVLM safety. Code is available at \url{anonymous.4open.science/r/safeclip-2C01}."
}
Markdown (Informal)
[Zero-Shot Defense Against Toxic Images via Inherent Multimodal Alignment in LVLMs](https://aclanthology.org/2025.findings-emnlp.767/) (Zhao et al., Findings 2025)
ACL