% Conference paper by Leite, Scarton and Silva in the RANLP 2023 proceedings.
% The url field uses the canonical ACL Anthology address for this paper ID
% (2023.ranlp-1.68) instead of a preview/staging host.
@inproceedings{leite-etal-2023-noisy,
  title     = {Noisy Self-Training with Data Augmentations for Offensive and Hate Speech Detection Tasks},
  author    = {Leite, Jo{\~a}o and
               Scarton, Carolina and
               Silva, Diego},
  editor    = {Mitkov, Ruslan and
               Angelova, Galia},
  booktitle = {Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing},
  month     = sep,
  year      = {2023},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2023.ranlp-1.68/},
  pages     = {631--640},
  abstract  = {Online social media is rife with offensive and hateful comments, prompting the need for their automatic detection given the sheer amount of posts created every second. Creating high-quality human-labelled datasets for this task is difficult and costly, especially because non-offensive posts are significantly more frequent than offensive ones. However, unlabelled data is abundant, easier, and cheaper to obtain. In this scenario, self-training methods, using weakly-labelled examples to increase the amount of training data, can be employed. Recent {\textquotedblleft}noisy{\textquotedblright} self-training approaches incorporate data augmentation techniques to ensure prediction consistency and increase robustness against noisy data and adversarial attacks. In this paper, we experiment with default and noisy self-training using three different textual data augmentation techniques across five different pre-trained BERT architectures varying in size. We evaluate our experiments on two offensive/hate-speech datasets and demonstrate that (i) self-training consistently improves performance regardless of model size, resulting in up to +1.5{\%} F1-macro on both datasets, and (ii) noisy self-training with textual data augmentations, despite being successfully applied in similar settings, decreases performance on offensive and hate-speech domains when compared to the default method, even with state-of-the-art augmentations such as backtranslation.},
}
Markdown (Informal)
[Noisy Self-Training with Data Augmentations for Offensive and Hate Speech Detection Tasks](https://aclanthology.org/2023.ranlp-1.68/) (Leite et al., RANLP 2023)
ACL