@inproceedings{cheng-etal-2025-fully,
title = "A Fully Probabilistic Perspective on Large Language Model Unlearning: Evaluation and Optimization",
author = "Cheng, Anda and
Huang, Wei and
Wang, Yinggui",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.452/",
pages = "8943--8954",
ISBN = "979-8-89176-332-6",
abstract = "Large Language Model Unlearning (LLMU) is a promising way to remove private or sensitive information from large language models. However, the comprehensive evaluation of LLMU remains underexplored. The dominant deterministic evaluation can yield overly optimistic assessments of unlearning efficacy. To mitigate this, we propose a Fully Probabilistic Evaluation (FPE) framework that incorporates input and output distributions in LLMU evaluation. FPE obtains a probabilistic evaluation result by querying unlearned models with various semantically similar inputs and multiple sampling attempts. We introduce an Input Distribution Sampling method in FPE to select high-quality inputs, enabling a stricter measure of information leakage risks. Furthermore, we introduce a Contrastive Embedding Loss (CEL) to advance the performance of LLMU. CEL employs contrastive learning to distance latent representations of unlearned samples from adaptively clustered contrast samples while aligning them with random vectors, leading to improved efficacy and robustness for LLMU. Our experiments show that FPE uncovers more unlearned information leakage risks than prior evaluation methods, and CEL improves unlearning effectiveness by at least 50.1{\%} and robustness by at least 37.2{\%} on Llama-2-7B while retaining high model utility."
}