% Findings of EMNLP 2025 paper; ACL Anthology ID 2025.findings-emnlp.634.
% The url field uses the permanent Anthology address (same ID as the DOI),
% not a temporary preview-branch link.
@inproceedings{pala-etal-2025-ferret,
  title     = {{Ferret}: Faster and Effective Automated Red Teaming with Reward-Based Scoring Technique},
  author    = {Pala, Tej Deep and
               Toh, Vernon and
               Bhardwaj, Rishabh and
               Poria, Soujanya},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.634/},
  doi       = {10.18653/v1/2025.findings-emnlp.634},
  pages     = {11845--11860},
  isbn      = {979-8-89176-335-7},
  abstract  = {As large language models (LLMs) are increasingly integrated into real-world applications, ensuring their safety and robustness is critical. Automated red-teaming methods generate adversarial attacks to identify vulnerabilities, but existing approaches often face challenges like slow performance, limited categorical diversity, and high resource demands. We propose Ferret, a novel method that enhances the baseline, Rainbow Teaming by generating multiple adversarial prompt mutations per iteration and ranking them using scoring functions such as reward models, Llama Guard, and LLM-as-a-judge. Ferret achieves a 95{\%} attack success rate (ASR), a 46{\%} improvement over baseline, and reduces time to a 90{\%} ASR by 15.2{\%}. Additionally, it generates transferable adversarial prompts effective on larger LLMs. Our code is available at https://github.com/declare-lab/ferret},
}
Markdown (Informal)
[Ferret: Faster and Effective Automated Red Teaming with Reward-Based Scoring Technique](https://aclanthology.org/2025.findings-emnlp.634/) (Pala et al., Findings 2025)
ACL