@inproceedings{ch-wang-etal-2024-androids,
title = "Do Androids Know They`re Only Dreaming of Electric Sheep?",
author = "CH-Wang, Sky and
Van Durme, Benjamin and
Eisner, Jason and
Kedzie, Chris",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-acl.260/",
doi = "10.18653/v1/2024.findings-acl.260",
pages = "4401--4420",
abstract = "We design probes trained on the internal representations of a transformer language model to predict its hallucinatory behavior on three grounded generation tasks. To train the probes, we annotate for span-level hallucination on both sampled (organic) and manually edited (synthetic) reference outputs. Our probes are narrowly trained and we find that they are sensitive to their training domain: they generalize poorly from one task to another or from synthetic to organic hallucinations. However, on in-domain data, they can reliably detect hallucinations at many transformer layers, achieving 95{\%} of their peak performance as early as layer 4. Here, probing proves accurate for evaluating hallucination, outperforming several contemporary baselines and even surpassing an expert human annotator in response-level detection F1. Similarly, on span-level labeling, probes are on par or better than the expert annotator on two out of three generation tasks. Overall, we find that probing is a feasible and efficient alternative to language model hallucination evaluation when model states are available."
}
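
As a rough illustration of the probing setup described in the abstract, the Python sketch below trains a linear classifier on per-token hidden states to flag hallucinated spans. This is not the authors' code: the hidden states are random placeholders standing in for activations extracted from an intermediate transformer layer, and the probe architecture, hidden size, and labeling scheme are assumptions made only for illustration.

# Minimal sketch (not the paper's implementation): a linear probe over
# per-token hidden states that labels each token as hallucinated or grounded.
# In practice the features would come from a forward pass of the probed
# language model on grounded-generation outputs with span-level annotations.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

rng = np.random.default_rng(0)

HIDDEN_DIM = 768        # assumed hidden size of the probed model
N_TOKENS_TRAIN = 2000   # annotated tokens used to fit the probe
N_TOKENS_TEST = 500

# Placeholder "activations" and labels (1 = token inside a hallucinated span).
X_train = rng.normal(size=(N_TOKENS_TRAIN, HIDDEN_DIM))
y_train = rng.integers(0, 2, size=N_TOKENS_TRAIN)
X_test = rng.normal(size=(N_TOKENS_TEST, HIDDEN_DIM))
y_test = rng.integers(0, 2, size=N_TOKENS_TEST)

# The probe: a simple linear classifier trained on frozen hidden states.
probe = LogisticRegression(max_iter=1000)
probe.fit(X_train, y_train)

print("span-level F1:", f1_score(y_test, probe.predict(X_test)))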