@inproceedings{zhu-rudzicz-2020-information,
title = "An information theoretic view on selecting linguistic probes",
author = "Zhu, Zining and
Rudzicz, Frank",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.744/",
doi = "10.18653/v1/2020.emnlp-main.744",
pages = "9251--9262",
    abstract = "There is increasing interest in assessing the linguistic knowledge encoded in neural representations. A popular approach is to attach a diagnostic classifier {--} or {\textquotedblleft}probe{\textquotedblright} {--} to perform supervised classification from internal representations. However, how to select a good probe is in debate. Hewitt and Liang (2019) showed that a high performance on diagnostic classification itself is insufficient, because it can be attributed to either {\textquotedblleft}the representation being rich in knowledge{\textquotedblright}, or {\textquotedblleft}the probe learning the task{\textquotedblright}, which Pimentel et al. (2020) challenged. We show this dichotomy is valid information-theoretically. In addition, we find that the {\textquotedblleft}good probe{\textquotedblright} criteria proposed by the two papers, \textit{selectivity} (Hewitt and Liang, 2019) and \textit{information gain} (Pimentel et al., 2020), are equivalent {--} the errors of their approaches are identical (modulo irrelevant terms). Empirically, these two selection criteria lead to results that highly agree with each other."
}
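For context, the two probe-selection criteria named in the abstract are commonly written as below. This is a hedged sketch for illustration only; the notation (acc, \hat{I}, B) is an assumption and is not quoted from the paper itself.

% Sketch of the two criteria, assuming the usual formulations:
% f is a probe; acc_task and acc_control are its accuracies on the
% linguistic task and on a control task with randomized labels;
% T is the linguistic target, R the representation under study, B a
% baseline (e.g., non-contextual) representation, and \hat{I} a
% mutual-information estimate derived from the probe's cross-entropy.
\begin{align*}
  \text{selectivity}(f) &= \mathrm{acc}_{\mathrm{task}}(f) - \mathrm{acc}_{\mathrm{control}}(f)
    && \text{(Hewitt and Liang, 2019)}\\
  \text{information gain} &\approx \hat{I}(T; R) - \hat{I}(T; B)
    && \text{(Pimentel et al., 2020)}
\end{align*}
% Both quantities are computed from differences of probe losses, which is
% roughly how the abstract's claim that the criteria coincide "modulo
% irrelevant terms" should be read.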
Markdown (Informal)
[An information theoretic view on selecting linguistic probes](https://aclanthology.org/2020.emnlp-main.744/) (Zhu & Rudzicz, EMNLP 2020)