% ToxiPrompt (Lee & Lee), EACL 2026 long paper; ACL Anthology ID 2026.eacl-long.170.
% The url field uses the canonical Anthology permalink rather than a branch-preview host.
@inproceedings{lee-lee-2026-toxiprompt,
  author    = {Lee, Seungho and Lee, Kyumin},
  title     = {{ToxiPrompt}: A Two-Stage Red-Teaming Approach for Balancing Adversarial Prompt Diversity and Response Toxicity},
  editor    = {Demberg, Vera and Inui, Kentaro and Marquez, Llu{\'\i}s},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {3682--3696},
  isbn      = {979-8-89176-380-7},
  url       = {https://aclanthology.org/2026.eacl-long.170/},
  abstract  = {While large language models (LLMs) offer great promise, they also pose concrete safety risks. To audit and mitigate these risks, researchers have developed automated red-teaming methods, which generate adversarial prompts to elicit unsafe behavior of target LLMs during evaluation. Recent automated red-teaming methods for LLMs face a persistent trade-off: techniques that increase prompt diversity often reduce the level of the toxicity elicited from the target LLMs, while toxicity-maximizing methods tend to collapse diversity. To address the limitations, we propose ToxiPrompt, a two-stage framework that explicitly separates exploration (diversity) from exploitation (toxicity) and reunifies them with a single selection criterion to balance between diversity and toxicity. Experimental results show that ToxiPrompt outperforms four state-of-the-art baselines in both adversarial prompt diversity and the level of elicited toxicity from target LLMs, improving 14.6\% harmonic mean of toxicity and diversity against the best baseline. The approach also performs well for multiple instruction-tuned target LLMs (Llama-2/3, Qwen, Mistral) without re-tuning, achieving up to 55\% harmonic mean improvement against the best baseline. Our code is available at https://github.com/seungho715/ToxiPrompt},
}
Markdown (Informal)
[ToxiPrompt: A Two-Stage Red-Teaming Approach for Balancing Adversarial Prompt Diversity and Response Toxicity](https://preview.aclanthology.org/manual-author-scripts/2026.eacl-long.170/) (Lee & Lee, EACL 2026)
ACL