% ACL Anthology export; the Anthology ID in the url path is 2025.findings-ijcnlp.15.
% NOTE(review): the exported url pointed at the temporary staging branch
% preview.aclanthology.org/ingest-ijcnlp-aacl/; it is replaced below by the
% canonical aclanthology.org form built from the same ID. Confirm it resolves.
@inproceedings{young-2025-information,
  title     = {Information-theoretic Distinctions Between Deception and Confusion},
  author    = {Young, Robin},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-ijcnlp.15/},
  pages     = {258--268},
  isbn      = {979-8-89176-303-6},
  abstract  = {We propose an information-theoretic formalization of the distinction between two fundamental AI safety failure modes: deceptive alignment and goal drift. While both can lead to systems that appear misaligned, we demonstrate that they represent distinct forms of information divergence occurring at different interfaces in the human-AI system. Deceptive alignment creates entropy between an agent{'}s true goals and its observable behavior, while goal drift, or confusion, creates entropy between the intended human goal and the agent{'}s actual goal. Though often observationally equivalent, these failures necessitate different interventions. We present a formal model and an illustrative thought experiment to clarify this distinction. We offer a formal language for re-examining prominent alignment challenges observed in Large Language Models (LLMs), offering novel perspectives on their underlying causes.},
}
Markdown (Informal)
[Information-theoretic Distinctions Between Deception and Confusion](https://aclanthology.org/2025.findings-ijcnlp.15/) (Young, Findings 2025)
ACL
- Robin Young. 2025. Information-theoretic Distinctions Between Deception and Confusion. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 258–268, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.