@inproceedings{carvalho-etal-2026-unsupervised,
    title = "Unsupervised Evaluation of Explanations for Hate Speech Classification in {Portuguese}",
    author = "Carvalho, Isabel and
      Oliveira, Hugo Gon{\c{c}}alo and
      Silva, Catarina",
    editor = "Souza, Marlo and
      de-Dios-Flores, Iria and
      Santos, Diana and
      Freitas, Larissa and
      Souza, Jackson Wilke da Cruz and
      Ribeiro, Eug{\'e}nio",
    booktitle = "Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1",
    month = apr,
    year = "2026",
    address = "Salvador, Brazil",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-dnd/2026.propor-1.77/",
    pages = "780--789",
    isbn = "979-8-89176-387-6",
    abstract = "Top-performing Artificial Intelligence models often operate as black boxes. Explainable AI (XAI) can increase transparency, but its evaluation is currently hindered by a lack of annotated explanation data and agreed-upon validation standards. We propose a framework for evaluating the faithfulness of explanations in Portuguese hate speech detection. Our approach is based on the premise that a faithful explanation should identify features whose removal degrades a model{'}s performance. We follow a three-step process: (i) prediction on the original input; (ii) identification and removal of explanatory keywords; and (iii), prediction on the modified input, with performance differences used as an evaluation signal. We conduct experiments using ensemble classifiers, multiple keyword selection strategies, and SHAP and LIME as XAI methods. In addition, Large Language Models (LLMs) are explored both as classifiers and as explainers. Results demonstrate that removing explanatory keywords degrades model performance more than random word removal, indicating explanation faithfulness. Notably, SHAP and LIME consistently provided more faithful explanations than LLM-generated or manual alternatives, although impact depends on the keyword selection strategy. These findings highlight the importance of standardised, unsupervised evaluation protocols for XAI and the faithfulness limitations of current generative LLM explanations."
}
@comment{Informal Markdown citation from the ACL Anthology export page, kept here for reference only:
Markdown (Informal)
[Unsupervised Evaluation of Explanations for Hate Speech Classification in Portuguese](https://preview.aclanthology.org/ingest-dnd/2026.propor-1.77/) (Carvalho et al., PROPOR 2026)
ACL}