@inproceedings{patel-etal-2025-evaluating,
title = "Evaluating Large Language Models for Detecting Antisemitism",
author = "Patel, Jay and
Mehta, Hrudayangam and
Blackburn, Jeremy",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1792/",
pages = "35356--35385",
ISBN = "979-8-89176-332-6",
abstract = "Detecting hateful content is a challenging and important problem. Automated tools, like machine{-}learning models, can help, but they require continuous training to adapt to the ever-changing landscape of social media. In this work, we evaluate eight open-source LLMs' capability to detect antisemitic content, specifically leveraging in-context definition as a policy guideline. We explore various prompting techniques and design a new CoT-like prompt, Guided-CoT. Guided{-}CoT handles the in-context policy well, increasing performance across all evaluated models, regardless of decoding configuration, model sizes, or reasoning capability. Notably, Llama 3.1 70B outperforms fine-tuned GPT-3.5. Additionally, we examine LLM errors and introduce metrics to quantify semantic divergence in model-generated rationales, revealing notable differences and paradoxical behaviors among LLMs. Our experiments highlight the differences observed across LLMs' utility, explainability, and reliability."
}
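For a concrete picture of the setup the abstract describes, below is a minimal, hypothetical sketch of a Guided-CoT-style prompt that embeds an in-context policy definition and guided reasoning steps before requesting a label. The function name, step wording, and placeholder definition are illustrative assumptions, not the authors' actual prompt or code.

```python
# Hypothetical sketch of a Guided-CoT-style prompt builder.
# The policy text, step wording, and names here are placeholders;
# the paper supplies its own in-context definition and prompt design.

POLICY_DEFINITION = (
    "Placeholder policy text: the definition of antisemitism used as "
    "the in-context guideline goes here."
)

GUIDED_STEPS = [
    "Identify who or what the post targets.",
    "Check whether the post falls under the policy definition above.",
    "Explain your reasoning briefly.",
    "Answer with a final label: ANTISEMITIC or NOT ANTISEMITIC.",
]

def build_guided_cot_prompt(post: str) -> str:
    """Assemble a prompt combining the policy definition, the post, and guided steps."""
    steps = "\n".join(f"{i}. {s}" for i, s in enumerate(GUIDED_STEPS, 1))
    return (
        f"Policy definition:\n{POLICY_DEFINITION}\n\n"
        f"Post:\n{post}\n\n"
        f"Follow these steps and show your reasoning:\n{steps}"
    )

if __name__ == "__main__":
    print(build_guided_cot_prompt("example social media post"))
```

The resulting string would be sent to each evaluated LLM; the study itself compares such prompting variants across models, decoding configurations, and sizes.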