@comment{
  NAACL 2025 Student Research Workshop paper (Volume 4 of the NAACL-HLT 2025
  proceedings), as exported from the ACL Anthology.
  The url field uses the canonical Anthology address rather than a staging
  preview branch, which is not a stable citation target.
  NOTE(review): the editor list contains both "Haider, Samar" and
  "Haider, Sammar" -- possibly the same person listed twice; verify against
  the proceedings front matter before removing either.
  NOTE(review): "Leonor Pacheco, Maria" is kept as exported; presumably the
  surname is "Pacheco" with given names "Maria Leonor" -- TODO confirm.
}
@inproceedings{otey-etal-2025-representing,
  author    = {Otey, Jood and
               Biester, Laura and
               Wilson, Steven R},
  title     = {Representing and Clustering Errors in Offensive Language Detection},
  editor    = {Ebrahimi, Abteen and
               Haider, Samar and
               Liu, Emmy and
               Haider, Sammar and
               Leonor Pacheco, Maria and
               Wein, Shira},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {368--380},
  isbn      = {979-8-89176-192-6},
  url       = {https://aclanthology.org/2025.naacl-srw.36/},
  abstract  = {Content moderation is essential in preventing the spread of harmful content on the Internet. However, there are instances where moderation fails and it is important to understand when and why that happens. Workflows that aim to uncover a system's weakness typically use clustering of the data points' embeddings to group errors together. In this paper, we evaluate the K-Means clustering of four text representations for the task of offensive language detection in English and Levantine Arabic. We find Sentence-BERT (SBERT) embeddings give the most human-interpretable clustering for English errors and the grouping is mainly based on the targeted group in the text. Meanwhile, SBERT embeddings of Large Language Model (LLM)-generated linguistic features give the most interpretable clustering for Arabic errors.},
}
Markdown (Informal)
[Representing and Clustering Errors in Offensive Language Detection](https://aclanthology.org/2025.naacl-srw.36/) (Otey et al., NAACL 2025)
ACL
- Jood Otey, Laura Biester, and Steven R Wilson. 2025. Representing and Clustering Errors in Offensive Language Detection. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 368–380, Albuquerque, USA. Association for Computational Linguistics.