@inproceedings{chen-etal-2025-sdd,
    title = "{SDD}: Self-Degraded Defense against Malicious Fine-tuning",
    author = "Chen, ZiXuan and
      Lu, Weikai and
      Lin, Xin and
      Zeng, Ziqian",
    editor = "Che, Wanxiang and
      Nabende, Joyce and
      Shutova, Ekaterina and
      Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.1412/",
    pages = "29109--29125",
    isbn = "979-8-89176-251-0",
    abstract = "Open-source Large Language Models (LLMs) often employ safety alignment methods to resist harmful instructions. However, recent research shows that maliciously fine-tuning these LLMs on harmful data can easily bypass these safeguards. To counter this, we theoretically uncover why malicious fine-tuning succeeds and identify potential defense strategies. Building on the theoretical analysis, we introduce the Self-Degraded Defense (SDD) framework. SDD encourages LLMs to produce high-quality but irrelevant responses to harmful prompts. When attackers attempt malicious fine-tuning, the general capability of the LLM aligned by SDD will significantly decrease, rendering it incapable of following harmful instructions. Our experimental results confirm SDD{'}s effectiveness against such attacks. Our code is available at \url{https://github.com/ZeroNLP/SDD}."
}
Markdown (Informal)
[SDD: Self-Degraded Defense against Malicious Fine-tuning](https://aclanthology.org/2025.acl-long.1412/) (Chen et al., ACL 2025)
ACL
- ZiXuan Chen, Weikai Lu, Xin Lin, and Ziqian Zeng. 2025. SDD: Self-Degraded Defense against Malicious Fine-tuning. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 29109–29125, Vienna, Austria. Association for Computational Linguistics.