@inproceedings{you-lowd-2022-towards,
title = "Towards Stronger Adversarial Baselines Through Human-{AI} Collaboration",
author = "You, Wencong and
Lowd, Daniel",
editor = "Shavrina, Tatiana and
Mikhailov, Vladislav and
Malykh, Valentin and
Artemova, Ekaterina and
Serikov, Oleg and
Protasov, Vitaly",
booktitle = "Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.nlppower-1.2",
doi = "10.18653/v1/2022.nlppower-1.2",
pages = "11--21",
abstract = "Natural language processing (NLP) systems are often used for adversarial tasks such as detecting spam, abuse, hate speech, and fake news. Properly evaluating such systems requires dynamic evaluation that searches for weaknesses in the model, rather than a static test set. Prior work has evaluated such models on both manually and automatically generated examples, but both approaches have limitations: manually constructed examples are time-consuming to create and are limited by the imagination and intuition of the creators, while automatically constructed examples are often ungrammatical or labeled inconsistently. We propose to combine human and AI expertise in generating adversarial examples, benefiting from humans{'} expertise in language and automated attacks{'} ability to probe the target system more quickly and thoroughly. We present a system that facilitates attack construction, combining human judgment with automated attacks to create better attacks more efficiently. Preliminary results from our own experimentation suggest that human-AI hybrid attacks are more effective than either human-only or AI-only attacks. A complete user study to validate these hypotheses is still pending.",
}
Markdown (Informal)
[Towards Stronger Adversarial Baselines Through Human-AI Collaboration](https://aclanthology.org/2022.nlppower-1.2) (You & Lowd, nlppower 2022)
ACL