@inproceedings{li-etal-2025-piguard,
title = "{PIG}uard: Prompt Injection Guardrail via Mitigating Overdefense for Free",
author = "Li, Hao and
Liu, Xiaogeng and
Zhang, Ning and
Xiao, Chaowei",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1468/",
pages = "30420--30437",
ISBN = "979-8-89176-251-0",
abstract = "Prompt injection attacks pose a critical threat to large language models (LLMs), enabling goal hijacking and data leakage. Prompt guard models, though effective in defense, suffer from over-defense{---}falsely flagging benign inputs as malicious due to trigger word bias. To address this issue, we introduce NotInject, an evaluation dataset that systematically measures over-defense across various prompt guard models. NotInject contains 339 benign samples enriched with trigger words common in prompt injection attacks, enabling fine-grained evaluation. Our results show that state-of-the-art models suffer from over-defense issues, with accuracy dropping close to random guessing levels (60{\%}). To mitigate this, we propose PIGuard, a novel prompt guard model that incorporates a new training strategy, Mitigating Over-defense for Free (MOF), which significantly reduces the bias on trigger words. PIGuard demonstrates state-of-the-art performance on diverse benchmarks including NotInject, surpassing the existing best model by 30.4{\%}, offering a robust and open-source solution for detecting prompt injection attacks. The code and datasets are released at https://github.com/leolee99/PIGuard."
}
Markdown (Informal)
[PIGuard: Prompt Injection Guardrail via Mitigating Overdefense for Free](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1468/) (Li et al., ACL 2025)
ACL