% LREC 2022 paper, ACL Anthology ID 2022.lrec-1.238.
% The url field uses the canonical Anthology address rather than a preview-branch build.
@inproceedings{toraman-etal-2022-large,
  title     = {Large-Scale Hate Speech Detection with Cross-Domain Transfer},
  author    = {Toraman, Cagri and
               {\c{S}}ahinu{\c{c}}, Furkan and
               Yilmaz, Eyup},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  month     = jun,
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2022.lrec-1.238/},
  pages     = {2215--2225},
  abstract  = {The performance of hate speech detection models relies on the datasets on which the models are trained. Existing datasets are mostly prepared with a limited number of instances or hate domains that define hate topics. This hinders large-scale analysis and transfer learning with respect to hate domains. In this study, we construct large-scale tweet datasets for hate speech detection in English and a low-resource language, Turkish, consisting of human-labeled 100k tweets per each. Our datasets are designed to have equal number of tweets distributed over five domains. The experimental results supported by statistical tests show that Transformer-based language models outperform conventional bag-of-words and neural models by at least 5{\%} in English and 10{\%} in Turkish for large-scale hate speech detection. The performance is also scalable to different training sizes, such that 98{\%} of performance in English, and 97{\%} in Turkish, are recovered when 20{\%} of training instances are used. We further examine the generalization ability of cross-domain transfer among hate domains. We show that 96{\%} of the performance of a target domain in average is recovered by other domains for English, and 92{\%} for Turkish. Gender and religion are more successful to generalize to other domains, while sports fail most.},
}
Markdown (Informal)
[Large-Scale Hate Speech Detection with Cross-Domain Transfer](https://aclanthology.org/2022.lrec-1.238/) (Toraman et al., LREC 2022)
ACL