@inproceedings{lu-etal-2025-merger,
title = "Merger-as-a-Stealer: Stealing Targeted {PII} from Aligned {LLM}s with Model Merging",
author = "Lu, Lin and
Zuo, Zhigang and
Sheng, Ziji and
Zhou, Pan",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.295/",
pages = "5817--5836",
ISBN = "979-8-89176-332-6",
abstract = "Model merging has emerged as a promising approach for updating large language models (LLMs) by integrating multiple domain-specific models into a cross-domain merged model. Despite its utility and plug-and-play nature, unmonitored mergers can introduce significant security vulnerabilities, such as backdoor attacks and model merging abuse. In this paper, we identify a novel and more realistic attack surface where a malicious merger can extract targeted personally identifiable information (PII) from an aligned model with model merging. Specifically, we propose Merger-as-a-Stealer, a two-stage framework to achieve this attack: First, the attacker fine-tunes a malicious model to force it to respond to any PII-related queries. The attacker then uploads this malicious model to the model merging conductor and obtains the merged model. Second, the attacker inputs direct PII-related queries to the merged model to extract targeted PII. Extensive experiments demonstrate that Merger-as-a-Stealer successfully executes attacks against various LLMs and model merging methods across diverse settings, highlighting the effectiveness of the proposed framework. Given that this attack enables character-level extraction for targeted PII without requiring any additional knowledge from the attacker, we stress the necessity for improved model alignment and more robust defense mechanisms to mitigate such threats."
}Markdown (Informal)
[Merger-as-a-Stealer: Stealing Targeted PII from Aligned LLMs with Model Merging](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.295/) (Lu et al., EMNLP 2025)
ACL