% GermEval 2021 shared-task system paper (ACL Anthology ID 2021.germeval-1.4).
% The url field uses the canonical aclanthology.org host rather than a branch preview host.
@inproceedings{calizzano-etal-2021-dfki,
  title     = {{DFKI} {SLT} at {GermEval} 2021: Multilingual Pre-training and Data Augmentation for the Classification of Toxicity in Social Media Comments},
  author    = {Calizzano, Remi and
               Ostendorff, Malte and
               Rehm, Georg},
  editor    = {Risch, Julian and
               Stoll, Anke and
               Wilms, Lena and
               Wiegand, Michael},
  booktitle = {Proceedings of the {GermEval} 2021 Shared Task on the Identification of Toxic, Engaging, and Fact-Claiming Comments},
  month     = sep,
  year      = {2021},
  address   = {Duesseldorf, Germany},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.germeval-1.4/},
  pages     = {25--31},
  abstract  = {We present our submission to the first subtask of GermEval 2021 (classification of German Facebook comments as toxic or not). Binary sequence classification is a standard NLP task with known state-of-the-art methods. Therefore, we focus on data preparation by using two different techniques: task-specific pre-training and data augmentation. First, we pre-train multilingual transformers (XLM-RoBERTa and MT5) on 12 hatespeech detection datasets in nine different languages. In terms of F1, we notice an improvement of 10{\%} on average, using task-specific pre-training. Second, we perform data augmentation by labelling unlabelled comments, taken from Facebook, to increase the size of the training dataset by 79{\%}. Models trained on the augmented training dataset obtain on average +0.0282 (+5{\%}) F1 score compared to models trained on the original training dataset. Finally, the combination of the two techniques allows us to obtain an F1 score of 0.6899 with XLM-RoBERTa and 0.6859 with MT5. The code of the project is available at: \url{https://github.com/airKlizz/germeval2021toxic}.},
}
Markdown (Informal)
[DFKI SLT at GermEval 2021: Multilingual Pre-training and Data Augmentation for the Classification of Toxicity in Social Media Comments](https://aclanthology.org/2021.germeval-1.4/) (Calizzano et al., GermEval 2021)
ACL