@inproceedings{berezin-etal-2025-tip,
title = "The {TIP} of the Iceberg: Revealing a Hidden Class of Task-in-Prompt Adversarial Attacks on {LLM}s",
author = "Berezin, Sergey and
Farahbakhsh, Reza and
Crespi, Noel",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.334/",
pages = "6716--6730",
ISBN = "979-8-89176-251-0",
abstract = "We present a novel class of jailbreak adversarial attacks on LLMs, termed Task-in-Prompt (TIP) attacks. Our approach embeds sequence-to-sequence tasks (e.g., cipher decoding, riddles, code execution) into the model{'}s prompt to indirectly generate prohibited inputs. To systematically assess the effectiveness of these attacks, we introduce the PHRYGE benchmark. We demonstrate that our techniques successfully circumvent safeguards in six state-of-the-art language models, including GPT-4o and LLaMA 3.2. Our findings highlight critical weaknesses in current LLM safety alignment and underscore the urgent need for more sophisticated defence strategies."
}