@inproceedings{geng-etal-2025-mathsf,
title = "$\mathsf{Con Instruction}$: Universal Jailbreaking of Multimodal Large Language Models via Non-Textual Modalities",
author = "Geng, Jiahui and
Tran, Thy Thy and
Nakov, Preslav and
Gurevych, Iryna",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.146/",
pages = "2917--2933",
ISBN = "979-8-89176-251-0",
abstract = "Existing attacks against multimodal language models often communicate instruction through text, either as an explicit malicious instruction or a crafted generic prompt, and accompanied by a toxic image. In contrast, here we exploit the capabilities of MLLMs in following non-textual instruction, i.e., an adversarial image or audio, namely Con Instruction. It is a novel gray-box attack method that generates adversarial images or audio to convey specific harmful instructions to MLLMs. We also find that combining our adversarial examples with certain non-empty text inputs amplifies attack success, while appending these after malicious text has limited effects. To evaluate whether an attack is successful, we introduce a new attack response categorization (ARC) that considers the response quality and relevancy concerning the malicious instruction. The results show that Con Instruction effectively bypasses the safety mechanisms in various visual and audio-language models, including LLaVA-v1.5, InternVL, Qwen-VL, and Qwen-Audio, across two standard benchmarks: AdvBench and SafeBench. Specifically, our method achieves the highest attack success rates, reaching 81.3{\%} and 86.6{\%} on LLaVA-v1.5 (13B). We show that larger models are more susceptible toCon Instruction, contrasting observations in their underlying LLMs. On the defense side, we explore various methods against our attacks and find substantial gaps among existing techniques. The code will be made available upon publication."
}