% Basile, Franco-Salvador and Rosso (2023), Findings of EMNLP 2023.
% The url field holds the canonical ACL Anthology address (not a preview-build
% mirror). In the title, "Efficient" is brace-protected because it follows a
% full stop and would otherwise be lowercased by sentence-casing styles.
@inproceedings{basile-etal-2023-zero,
  title     = {Zero-Shot Data Maps. {Efficient} Dataset Cartography Without Model Training},
  author    = {Basile, Angelo and
               Franco-Salvador, Marc and
               Rosso, Paolo},
  editor    = {Bouamor, Houda and
               Pino, Juan and
               Bali, Kalika},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.findings-emnlp.554/},
  doi       = {10.18653/v1/2023.findings-emnlp.554},
  pages     = {8264--8277},
  abstract  = {Data Maps (Swayamdipta, et al. 2020) have emerged as a powerful tool for diagnosing large annotated datasets. Given a model fitted on a dataset, these maps show each data instance from the dataset in a 2-dimensional space defined by a) the model{'}s confidence in the true class and b) the variability of this confidence. In previous work, confidence and variability are usually computed using training dynamics, which requires the fitting of a strong model to the dataset. In this paper, we introduce a novel approach: Zero-Shot Data Maps based on fast bi-encoder networks. For each data point, confidence on the true label and variability are computed over the members of an ensemble of zero-shot models constructed with different {---} but semantically equivalent {---} label descriptions, i.e., textual representations of each class in a given label space. We conduct a comparative analysis of maps compiled using traditional training dynamics and our proposed zero-shot models across various datasets. Our findings reveal that Zero-Shot Data Maps generally match those produced by the traditional method while delivering up to a 14x speedup. The code is available at https://github.com/symanto-research/zeroshot-cartography.},
}
Markdown (Informal)
[Zero-Shot Data Maps. Efficient Dataset Cartography Without Model Training](https://aclanthology.org/2023.findings-emnlp.554/) (Basile et al., Findings 2023)
ACL