@inproceedings{yu-etal-2026-now,
  title     = {Now You Hear Me: Audio Narrative Attacks Against Large Audio{--}Language Models},
  author    = {Yu, Ye and
               Jin, Haibo and
               Yu, Yaoning and
               Zhuang, Jun and
               Wang, Haohan},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.278/},
  pages     = {5925--5939},
  isbn      = {979-8-89176-380-7},
  abstract  = {Large audio-language models increasingly operate on raw speech inputs, enabling more seamless integration across domains such as voice assistants, education, and clinical triage. This transition, however, introduces a distinct class of vulnerabilities that remain largely uncharacterized. We examine the security implications of this modality shift by designing a text-to-audio jailbreak that embeds disallowed directives within a narrative-style audio stream. The attack leverages an advanced instruction-following text-to-speech (TTS) model to exploit structural and acoustic properties, thereby circumventing safety mechanisms primarily calibrated for text. When delivered through synthetic speech, the narrative format elicits restricted outputs from state-of-the-art models, including Gemini 2.0 Flash, achieving a 98.26{\%} success rate that substantially exceeds text-only baselines. These results highlight the need for safety frameworks that jointly reason over linguistic and paralinguistic representations, particularly as speech-based interfaces become more prevalent.},
}
@comment{Markdown (Informal) citation rendering from the ACL Anthology page — not a BibTeX entry:
[Now You Hear Me: Audio Narrative Attacks Against Large Audio–Language Models](https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.278/) (Yu et al., EACL 2026)
ACL}