@inproceedings{li-etal-2025-attack,
title = "Attack as Defense: Safeguarding Large Vision-Language Models from Jailbreaking by Adversarial Attacks",
author = "Li, Chongxin and
Wang, Hanzhang and
Fang, Yuchun",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1095/",
doi = "10.18653/v1/2025.findings-emnlp.1095",
pages = "20138--20152",
ISBN = "979-8-89176-335-7",
abstract = "Adversarial vulnerabilities in vision-language models pose a critical challenge to the reliability of large language systems, where typographic manipulations and adversarial perturbations can effectively bypass language model defenses. We introduce Attack as Defense (AsD), the first approach to proactively defend at the cross-modality level, embedding protective perturbations in vision to disrupt attacks before they propagate to the language model. By leveraging the semantic alignment between vision and language, AsD enhances adversarial robustness through model perturbations and system-level prompting. Unlike prior work that focuses on text-stage defenses, our method integrates visual defenses to reinforce prompt-based protections, mitigating jailbreaking attacks across benchmarks. Experiments on the LLaVA-1.5 show that AsD reduces attack success rates from 56.7{\%} to 12.6{\%} for typographic attacks and from 89.0{\%} to 47.5{\%} for adversarial perturbations. Further analysis reveals that the key bottleneck in vision-language security lies not in isolated model vulnerabilities, but in cross-modal interactions, where adversarial cues in the vision model fail to consistently activate the defense mechanisms of the language model."
}