% Conference paper: ACL 2026 main proceedings (Volume 1: Long Papers), pp. 17860--17881.
% NOTE(review): the url field points at a "preview ... ingest-acl" host; confirm the
% canonical permanent link before this entry is used in a camera-ready bibliography.
@inproceedings{wang-etal-2026-reference,
  title     = {Reference Attack: A New Cross-Modal Jailbreaking Attack against Multimodal Large Language Models},
  author    = {Wang, Yulong and
               Fu, Yifei and
               Gao, Jiayi},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.812/},
  pages     = {17860--17881},
  isbn      = {979-8-89176-390-6},
  abstract  = {Red team testing, an effective proactive method for evaluating the security of multimodal large language models (MLLMs), requires an expanding toolkit alongside the development of MLLM safeguards. We propose the Reference Attack, a powerful tool for red team testing against MLLMs. The Reference Attack is a reference-guided cross-modal jailbreak method that enhances existing prompt-to-image injection attacks by exploiting MLLMs' semantic reconstruction capabilities. Our method embeds malicious prompts in non-text modalities (e.g., images, spreadsheets) and constructs recursive symbolic references in text, enabling MLLMs to gradually recover and generate harmful content through layered reference resolution. The attack introduces a new vector that circumvents conventional content moderation by exploiting MLLMs' lack of security checks during cross-modal reference resolution. We evaluate the Reference Attack on leading MLLMs, including ChatGPT, Gemini, Claude, and the widely used open-source LLaMA model, and achieved an attack success rate of over 93{\%} across all tested models. Compared to state-of-the-art attacks, Reference Attack achieves higher success rates than all baselines under identical evaluation, with a maximum gain of 70.8{\%}. Our study reveals a critical gap in MLLM security and highlights the need for strict security auditing of cross-modal interactions in future content moderation.},
}
Markdown (Informal)
[Reference Attack: A New Cross-Modal Jailbreaking Attack against Multimodal Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.acl-long.812/) (Wang et al., ACL 2026)
ACL