@inproceedings{wan-etal-2025-risk,
title = "On the Risk of Evidence Pollution for Malicious Social Text Detection in the Era of {LLM}s",
author = "Wan, Herun and
Luo, Minnan and
Su, Zhixiong and
Dai, Guang and
Zhao, Xiang",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.480/",
pages = "9731--9761",
ISBN = "979-8-89176-251-0",
    abstract = "Evidence-enhanced detectors present remarkable abilities in identifying malicious social text. However, the rise of large language models (LLMs) brings potential risks of evidence pollution that can confuse detectors. This paper explores potential manipulation scenarios, including basic pollution and rephrasing or generating evidence with LLMs. To mitigate the negative impact, we propose three defense strategies from the data and model sides: machine-generated text detection, a mixture of experts, and parameter updating. Extensive experiments on four malicious social text detection tasks with ten datasets illustrate that evidence pollution significantly compromises detectors, where the generating strategy causes up to a 14.4{\%} performance drop. Meanwhile, the defense strategies can mitigate evidence pollution, but they face limitations in practical deployment. Further analysis illustrates that polluted evidence (i) is of high quality, as evaluated by metrics and humans; (ii) compromises model calibration, increasing expected calibration error by up to 21.6{\%}; and (iii) can be integrated to amplify the negative impact, especially for encoder-based LMs, where accuracy drops by 21.8{\%}."
}