@inproceedings{tang-etal-2022-identifying,
title = "Identifying the Source of Vulnerability in Explanation Discrepancy: A Case Study in Neural Text Classification",
author = "Tang, Ruixuan and
Chen, Hanjie and
Ji, Yangfeng",
editor = "Bastings, Jasmijn and
Belinkov, Yonatan and
Elazar, Yanai and
Hupkes, Dieuwke and
Saphra, Naomi and
Wiegreffe, Sarah",
booktitle = "Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2022.blackboxnlp-1.30/",
doi = "10.18653/v1/2022.blackboxnlp-1.30",
pages = "356--370",
abstract = "Some recent works observed the instability of post-hoc explanations when input side perturbations are applied to the model. This raises the interest and concern in the stability of post-hoc explanations. However, the remaining question is: is the instability caused by the neural network model or the post-hoc explanation method? This work explores the potential source that leads to unstable post-hoc explanations. To separate the influence from the model, we propose a simple output probability perturbation method. Compared to prior input side perturbation methods, the output probability perturbation method can circumvent the neural model`s potential effect on the explanations and allow the analysis on the explanation method. We evaluate the proposed method with three widely-used post-hoc explanation methods (LIME (Ribeiro et al., 2016), Kernel Shapley (Lundberg and Lee, 2017a), and Sample Shapley (Strumbelj and Kononenko, 2010)). The results demonstrate that the post-hoc methods are stable, barely producing discrepant explanations under output probability perturbations. The observation suggests that neural network models may be the primary source of fragile explanations."
}
[Identifying the Source of Vulnerability in Explanation Discrepancy: A Case Study in Neural Text Classification](https://aclanthology.org/2022.blackboxnlp-1.30/) (Tang et al., BlackboxNLP 2022)
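As a rough illustration of the output probability perturbation idea summarized in the abstract, the following is a minimal sketch, not the authors' implementation: explanations are computed once against a model's original output probabilities and once against noise-perturbed, renormalized probabilities, then compared for discrepancy. It assumes a scikit-learn text classifier as a stand-in for the paper's neural models and the `lime` package; the noise scale `sigma` and the wrapper `perturbed_predict_proba` are hypothetical names and choices.

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# Toy two-class text classifier (stand-in for the paper's neural models).
texts = ["good movie", "great plot", "bad acting", "terrible film"]
labels = [1, 1, 0, 0]
clf = make_pipeline(TfidfVectorizer(), LogisticRegression())
clf.fit(texts, labels)

def perturbed_predict_proba(docs, sigma=0.01, seed=0):
    """Add small noise to the model's output probabilities and renormalize,
    so the explanation method sees perturbed outputs while the model and
    inputs stay fixed (sigma is an illustrative, hypothetical scale)."""
    rng = np.random.default_rng(seed)
    probs = clf.predict_proba(docs)
    noisy = np.clip(probs + rng.normal(0.0, sigma, probs.shape), 1e-6, None)
    return noisy / noisy.sum(axis=1, keepdims=True)

explainer = LimeTextExplainer(class_names=["neg", "pos"])
doc = "good acting but terrible plot"
# Explanation on the original outputs vs. on the perturbed outputs.
exp_clean = explainer.explain_instance(doc, clf.predict_proba, num_features=4)
exp_noisy = explainer.explain_instance(doc, perturbed_predict_proba, num_features=4)
print(exp_clean.as_list())
print(exp_noisy.as_list())  # compare feature attributions for discrepancy
```

If the two attribution lists stay close under such output perturbations (as the paper reports for LIME, Kernel Shapley, and Sample Shapley), the instability observed under input perturbations would point to the model rather than the explanation method.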