@inproceedings{you-etal-2023-large,
title = "Large Language Models Are Better Adversaries: Exploring Generative Clean-Label Backdoor Attacks Against Text Classifiers",
author = "You, Wencong and
Hammoudeh, Zayd and
Lowd, Daniel",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2023.findings-emnlp.833/",
doi = "10.18653/v1/2023.findings-emnlp.833",
pages = "12499--12527",
abstract = "Backdoor attacks manipulate model predictions by inserting innocuous triggers into training and test data. We focus on more realistic and more challenging clean-label attacks where the adversarial training examples are correctly labeled. Our attack, LLMBkd, leverages language models to automatically insert diverse style-based triggers into texts. We also propose a poison selection technique to improve the effectiveness of both LLMBkd as well as existing textual backdoor attacks. Lastly, we describe REACT, a baseline defense to mitigate backdoor attacks via antidote training examples. Our evaluations demonstrate LLMBkd`s effectiveness and efficiency, where we consistently achieve high attack success rates across a wide range of styles with little effort and no model training."
}