% Conference paper record (EACL 2026, Volume 1: Long Papers).
% NOTE(review): the url field below uses a preview.aclanthology.org path
% (ingest-eacl), which looks like a pre-publication link -- confirm whether a
% canonical aclanthology.org URL and/or a DOI is available before citing.
@inproceedings{zheng-etal-2026-validating,
  title     = {Validating Automatic Evaluation of Controllable Counterspeech Generation: Rankings Matter More Than Scores},
  author    = {Zheng, Yi and
               Ross, Bj{\"o}rn and
               Magdy, Walid},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.193/},
  pages     = {4131--4146},
  isbn      = {979-8-89176-380-7},
  abstract  = {Counterspeech generation has emerged as a promising approach to combat online hate speech, with recent work focusing on controlling attributes used in counterspeech, such as strategies or intents. While these attributes are often evaluated automatically using classifiers, a key goal of this evaluation is to compare the performance of different generation models. However, the validity of such evaluation results is questionable when the classifiers themselves have only modest performance. This paper examines the automatic evaluation of counterspeech attributes using a multi-attribute counterspeech dataset containing 2,728 samples. We investigate when automatic evaluation can be trusted for model comparison and address the limitations of current evaluation methodologies. We make concrete recommendations for how to perform classifier validation before model evaluation. Our classifier validation results demonstrate that even limited classifiers can produce trustworthy model rankings. Therefore, we argue that when comparing counterspeech generation models, a classifier{'}s ability to rank generation models is a more direct measure of its practical utility than traditional classification metrics, e.g., accuracy and F1.},
}
Markdown (Informal)
[Validating Automatic Evaluation of Controllable Counterspeech Generation: Rankings Matter More Than Scores](https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.193/) (Zheng et al., EACL 2026)
ACL