@inproceedings{fasching-lelkes-2025-model,
title = "Model-Dependent Moderation: Inconsistencies in Hate Speech Detection Across {LLM}-based Systems",
author = "Fasching, Neil and
Lelkes, Yphtach",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.1144/",
pages = "22271--22285",
ISBN = "979-8-89176-256-5",
abstract = "Content moderation systems powered by large language models (LLMs) are increasingly deployed to detect hate speech; however, no systematic comparison exists between different systems. If different systems produce different outcomes for the same content, it undermines consistency and predictability, leading to moderation decisions that appear arbitrary or unfair. Analyzing seven leading models{---}dedicated Moderation Endpoints (OpenAI, Mistral), frontier LLMs (Claude 3.5 Sonnet, GPT-4o, Mistral Large, DeepSeek V3), and specialized content moderation APIs (Google Perspective API){---}we demonstrate that moderation system choice fundamentally determines hate speech classification outcomes. Using a novel synthetic dataset of 1.3+ million sentences from a factorial design, we find identical content receives markedly different classification values across systems, with variations especially pronounced for specific demographic groups. Analysis across 125 distinct groups reveals these divergences reflect systematic differences in how models establish decision boundaries around harmful content, highlighting significant implications for automated content moderation."
}
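
A minimal illustrative sketch (not code from the paper) of the kind of cross-system comparison the abstract describes: sending an identical sentence, built from a factorial template, to two of the named moderation systems (the OpenAI Moderation Endpoint and Google's Perspective API) and comparing the scores they return. The template, the group placeholders, and the API key handling below are assumptions for illustration only.

# Hypothetical sketch: compare hate/toxicity scores from two moderation systems
# for identical content. Template and group terms are placeholders, not the
# paper's dataset.

from openai import OpenAI                # pip install openai
from googleapiclient import discovery    # pip install google-api-python-client

openai_client = OpenAI()  # expects OPENAI_API_KEY in the environment
perspective = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey="YOUR_PERSPECTIVE_API_KEY",
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

# A factorial design crosses sentence templates with demographic group terms,
# so every system scores exactly the same text.
template = "I think {group} people should not be allowed here."
groups = ["<group A>", "<group B>"]  # placeholders for the 125 groups analyzed

for group in groups:
    sentence = template.format(group=group)

    # OpenAI Moderation Endpoint: per-category scores in [0, 1]
    mod = openai_client.moderations.create(
        model="omni-moderation-latest", input=sentence
    )
    openai_hate = mod.results[0].category_scores.hate

    # Perspective API: probability-like TOXICITY score in [0, 1]
    resp = perspective.comments().analyze(body={
        "comment": {"text": sentence},
        "requestedAttributes": {"TOXICITY": {}},
        "languages": ["en"],
    }).execute()
    perspective_tox = resp["attributeScores"]["TOXICITY"]["summaryScore"]["value"]

    # Divergent scores for identical content illustrate the model-dependent
    # inconsistency the paper reports.
    print(f"{sentence!r}: OpenAI hate={openai_hate:.3f}, "
          f"Perspective toxicity={perspective_tox:.3f}")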