% Conference paper in the Findings of EMNLP 2025 volume.
% The url field is the ACL Anthology landing page whose ID matches the DOI suffix.
@inproceedings{chang-etal-2025-watermark,
  title     = {Watermark Smoothing Attacks against Language Models},
  author    = {Chang, Hongyan and
               Hassani, Hamed and
               Shokri, Reza},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.264/},
  doi       = {10.18653/v1/2025.findings-emnlp.264},
  pages     = {4915--4941},
  isbn      = {979-8-89176-335-7},
  abstract  = {Watermarking is a key technique for detecting AI-generated text. In this work, we study its vulnerabilities and introduce the Smoothing Attack, a novel watermark removal method. By leveraging the relationship between the model{'}s confidence and watermark detectability, our attack selectively smoothes the watermarked content, erasing watermark traces while preserving text quality. We validate our attack on open-source models ranging from 1.3B to 30B parameters on 10 different watermarks, demonstrating its effectiveness. Our findings expose critical weaknesses in existing watermarking schemes and highlight the need for stronger defenses.},
}
Markdown (Informal)
[Watermark Smoothing Attacks against Language Models](https://aclanthology.org/2025.findings-emnlp.264/) (Chang et al., Findings 2025)
ACL