% Conference paper from the EMNLP 2024 proceedings (ACL Anthology record 2024.emnlp-main.495).
% The method name in the title is brace-protected so sentence-casing styles keep its capitals,
% and the url field uses the canonical Anthology address, whose ID matches the DOI suffix.
@inproceedings{liu-etal-2024-revisiting,
    title = "Revisiting {Who's Harry Potter}: Towards Targeted Unlearning from a Causal Intervention Perspective",
    author = "Liu, Yujian and
      Zhang, Yang and
      Jaakkola, Tommi and
      Chang, Shiyu",
    editor = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.495/",
    doi = "10.18653/v1/2024.emnlp-main.495",
    pages = "8708--8731",
    abstract = "This paper investigates Who's Harry Potter (WHP), a pioneering yet insufficiently understood method for LLM unlearning. We explore it in two steps. First, we introduce a new task of LLM targeted unlearning, where given an unlearning target (e.g., a person) and some unlearning documents, we aim to unlearn only the information about the target, rather than everything in the unlearning documents. We further argue that a successful unlearning should satisfy criteria such as not outputting gibberish, not fabricating facts about the unlearning target, and not releasing factual information under jailbreak attacks. Second, we construct a causal intervention framework for targeted unlearning, where the knowledge of the unlearning target is modeled as a confounder between LLM input and output, and the unlearning process as a deconfounding process. This framework justifies and extends WHP, deriving a simple unlearning algorithm that includes WHP as a special case. Experiments on existing and new datasets show that our approach, without explicitly optimizing for the aforementioned criteria, achieves competitive performance in all of them."
}
Markdown (Informal)
[Revisiting Who’s Harry Potter: Towards Targeted Unlearning from a Causal Intervention Perspective](https://preview.aclanthology.org/Author-page-Marten-During-lu/2024.emnlp-main.495/) (Liu et al., EMNLP 2024)
ACL