% NLPDove system description paper for SemEval-2020 Task 12
% (offensive language detection with cross-lingual transfer).
% The url field holds the canonical ACL Anthology address for this paper ID,
% which matches the suffix of the DOI below.
@inproceedings{ahn-etal-2020-nlpdove,
  title     = {{NLPDove} at {SemEval}-2020 Task 12: Improving Offensive Language Detection with Cross-lingual Transfer},
  author    = {Ahn, Hwijeen and
               Sun, Jimin and
               Park, Chan Young and
               Seo, Jungyun},
  editor    = {Herbelot, Aurelie and
               Zhu, Xiaodan and
               Palmer, Alexis and
               Schneider, Nathan and
               May, Jonathan and
               Shutova, Ekaterina},
  booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation},
  month     = dec,
  year      = {2020},
  address   = {Barcelona (online)},
  publisher = {International Committee for Computational Linguistics},
  url       = {https://aclanthology.org/2020.semeval-1.206/},
  doi       = {10.18653/v1/2020.semeval-1.206},
  pages     = {1576--1586},
  abstract  = {This paper describes our approach to the task of identifying offensive languages in a multilingual setting. We investigate two data augmentation strategies: using additional semi-supervised labels with different thresholds and cross-lingual transfer with data selection. Leveraging the semi-supervised dataset resulted in performance improvements compared to the baseline trained solely with the manually-annotated dataset. We propose a new metric, Translation Embedding Distance, to measure the transferability of instances for cross-lingual data selection. We also introduce various preprocessing steps tailored for social media text along with methods to fine-tune the pre-trained multilingual BERT (mBERT) for offensive language identification. Our multilingual systems achieved competitive results in Greek, Danish, and Turkish at OffensEval 2020.},
}
Markdown (Informal)
[NLPDove at SemEval-2020 Task 12: Improving Offensive Language Detection with Cross-lingual Transfer](https://aclanthology.org/2020.semeval-1.206/) (Ahn et al., SemEval 2020)
ACL