% Conference paper: "ReCon" (He, Wang, Wang, Cai), published in
% Findings of the Association for Computational Linguistics: ACL 2026,
% pages 23427--23441.
% Braces inside title/booktitle protect whole words whose capitalisation
% must survive style-driven sentence casing.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% address, which looks like a staging location -- confirm and switch to the
% permanent Anthology URL (and add a doi if one is assigned) before release.
@inproceedings{he-etal-2026-recon,
  title     = {{ReCon}: Active Defense against Large Vision-Language Model Jailbreaks via Reverse Safety Concept Injection},
  author    = {He, Zheng and
               Wang, Yiwei and
               Wang, Hongxing and
               Cai, Yujun},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1173/},
  pages     = {23427--23441},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large Vision-Language Models (LVLMs) confront an escalating threat from sophisticated multimodal jailbreak attacks. However, existing defense strategies suffer from three critical limitations: (1) the neglect of visual threats; (2) a lack of fine-grained specificity regarding specific attack semantics; and (3) the absence of a dedicated jailbreak detection mechanism, which leads to unnecessary defensive measures against benign inputs. To address these limitations, we propose ReCon, a novel black-box defense framework. ReCon integrates a diffusion-based image purifier to neutralize visual perturbations and an autoencoder-based detector for anomaly filtration. At its core, it employs a Reverse Safety Concept Injection module that maps detected unsafe concepts to fine-grained, constructive Safe Concepts, generating targeted prompts to precisely rectify attack semantics. Extensive experiments demonstrate that ReCon significantly enhances the robustness of LVLMs against jailbreak attacks while preserving performance on benign tasks. Disclaimer: Samples in this paper may be harmful and cause discomfort.},
}
Markdown (Informal)
[ReCon: Active Defense against Large Vision-Language Model Jailbreaks via Reverse Safety Concept Injection](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1173/) (He et al., Findings 2026)
ACL