@inproceedings{hao-2020-evaluating,
  title     = {Evaluating Attribution Methods using White-Box {LSTM}s},
  author    = {Hao, Yiding},
  editor    = {Alishahi, Afra and
               Belinkov, Yonatan and
               Chrupa{\l}a, Grzegorz and
               Hupkes, Dieuwke and
               Pinter, Yuval and
               Sajjad, Hassan},
  booktitle = {Proceedings of the Third {BlackboxNLP} Workshop on Analyzing and Interpreting Neural Networks for {NLP}},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.blackboxnlp-1.28/},
  doi       = {10.18653/v1/2020.blackboxnlp-1.28},
  pages     = {300--313},
  abstract  = {Interpretability methods for neural networks are difficult to evaluate because we do not understand the black-box models typically used to test them. This paper proposes a framework in which interpretability methods are evaluated using manually constructed networks, which we call white-box networks, whose behavior is understood a priori. We evaluate five methods for producing attribution heatmaps by applying them to white-box LSTM classifiers for tasks based on formal languages. Although our white-box classifiers solve their tasks perfectly and transparently, we find that all five attribution methods fail to produce the expected model explanations.},
}
@comment{Citation-page residue retained for reference:
Markdown (Informal)
[Evaluating Attribution Methods using White-Box LSTMs](https://aclanthology.org/2020.blackboxnlp-1.28/) (Hao, BlackboxNLP 2020)
ACL
}