@inproceedings{le-etal-2021-sweet,
title = "A Sweet Rabbit Hole by {DARCY}: Using Honeypots to Detect Universal Trigger{'}s Adversarial Attacks",
author = "Le, Thai and
Park, Noseong and
Lee, Dongwon",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.acl-long.296/",
doi = "10.18653/v1/2021.acl-long.296",
pages = "3831--3844",
    abstract = "The Universal Trigger (UniTrigger) is a recently-proposed powerful adversarial textual attack method. Utilizing a learning-based mechanism, UniTrigger generates a fixed phrase that, when added to any benign inputs, can drop the prediction accuracy of a textual neural network (NN) model to near zero on a target class. To defend against this attack that can cause significant harm, in this paper, we borrow the ``honeypot'' concept from the cybersecurity community and propose DARCY, a honeypot-based defense framework against UniTrigger. DARCY greedily searches and injects multiple trapdoors into an NN model to ``bait and catch'' potential attacks. Through comprehensive experiments across four public datasets, we show that DARCY detects UniTrigger{'}s adversarial attacks with up to 99{\%} TPR and less than 2{\%} FPR in most cases, while maintaining the prediction accuracy (in F1) for clean inputs within a 1{\%} margin. We also demonstrate that DARCY with multiple trapdoors is robust to a diverse set of attack scenarios with attackers' varying levels of knowledge and skills. We release the source code of DARCY at: \url{https://github.com/lethaiq/ACL2021-DARCY-HoneypotDefenseNLP}."
}
Markdown (Informal)
[A Sweet Rabbit Hole by DARCY: Using Honeypots to Detect Universal Trigger’s Adversarial Attacks](https://aclanthology.org/2021.acl-long.296/) (Le et al., ACL-IJCNLP 2021)