% Conference paper "SAME" from the ACL 2026 main proceedings (Volume 1: Long Papers).
% NOTE(review): the url field points at the Anthology preview/ingest host;
% confirm the final public URL once the volume is released.
@inproceedings{wang-etal-2026-safety,
  author    = {Wang, Jiayi and Wang, Shipeng and Wu, Ji and Sun, Jian},
  title     = {{SAME}: Safety-Aware Model Editing Guided by Safety Transformation},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  year      = {2026},
  month     = jul,
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {35324--35343},
  isbn      = {979-8-89176-390-6},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1632/},
  abstract  = {Editing large language models is challenging as incorporating new knowledge often requires sequential parameter updates while maintaining model capability. In this work, we experimentally observe that sequential knowledge updating under the locate-then-edit framework can introduce safety risks, regardless of whether the knowledge being edited is benign or malicious. We propose a novel model editing approach that estimates safety transforms and identifies corresponding safety direction in the neural activation space, and then aligns neural activation updates and network parameter updates under the safety constraints, resulting in a safety-aware model editing approach. We evaluate our approach on open-source LLMs, Llama-3-8B-Instruct, Qwen3-4B-Instruct and Qwen2.5-14B-Instruct, using the benchmark datasets ZsRE and COUNTERFACT, as well as the malicious dataset Mal-KSet. Experimental results demonstrate that our approach effectively reduces unsafe responses to malicious queries while preserving the effectiveness of model editing.},
}
Markdown (Informal)
[SAME: Safety-Aware Model Editing Guided by Safety Transformation](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1632/) (Wang et al., ACL 2026)
ACL
- Jiayi Wang, Shipeng Wang, Ji Wu, and Jian Sun. 2026. SAME: Safety-Aware Model Editing Guided by Safety Transformation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 35324–35343, San Diego, California, United States. Association for Computational Linguistics.