@comment{Conference paper from Findings of ACL: EACL 2024 (Hsia, Pruthi, Singh, Lipton). Possessive apostrophes use the ASCII apostrophe rather than a backtick, which LaTeX would typeset as an opening quote. The url field is the canonical Anthology address, not a branch preview.}
@inproceedings{hsia-etal-2024-goodharts,
  title     = {{Goodhart's} Law Applies to {NLP}'s Explanation Benchmarks},
  author    = {Hsia, Jennifer and
               Pruthi, Danish and
               Singh, Aarti and
               Lipton, Zachary},
  editor    = {Graham, Yvette and
               Purver, Matthew},
  booktitle = {Findings of the Association for Computational Linguistics: EACL 2024},
  month     = mar,
  year      = {2024},
  address   = {St. Julian's, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-eacl.88/},
  pages     = {1322--1335},
  abstract  = {Despite the rising popularity of saliency-based explanations, the research community remains at an impasse, facing doubts concerning their purpose, efficacy, and tendency to contradict each other. Seeking to unite the community's efforts around common goals, several recent works have proposed evaluation metrics. In this paper, we critically examine two sets of metrics: the ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics, focusing our inquiry on natural language processing. First, we show that we can inflate a model's comprehensiveness and sufficiency scores dramatically without altering its predictions or explanations on in-distribution test inputs. Our strategy exploits the tendency for extracted explanations and their complements to be {\textquotedblleft}out-of-support{\textquotedblright} relative to each other and in-distribution inputs. Next, we demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple method that encodes the label, even though EVAL-X is precisely motivated to address such exploits. Our results raise doubts about the ability of current metrics to guide explainability research, underscoring the need for a broader reassessment of what precisely these metrics are intended to capture.},
}
Markdown (Informal)
[Goodhart’s Law Applies to NLP’s Explanation Benchmarks](https://aclanthology.org/2024.findings-eacl.88/) (Hsia et al., Findings 2024)
ACL
- Jennifer Hsia, Danish Pruthi, Aarti Singh, and Zachary Lipton. 2024. Goodhart’s Law Applies to NLP’s Explanation Benchmarks. In Findings of the Association for Computational Linguistics: EACL 2024, pages 1322–1335, St. Julian’s, Malta. Association for Computational Linguistics.