@comment{
  PROOD (Tint, Findings of EMNLP 2025), ACL Anthology ID 2025.findings-emnlp.1272.
  The url field uses the canonical aclanthology.org address rather than the
  preview.aclanthology.org branch build this entry was exported from; the
  Anthology ID matches the suffix of the doi field.
}
@inproceedings{tint-2025-prood,
    title     = "{PROOD}: A Simple {LLM} Out-of-Distribution Guardrail Leveraging Response Semantics",
    author    = "Tint, Joshua",
    editor    = "Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
    month     = nov,
    year      = "2025",
    address   = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2025.findings-emnlp.1272/",
    doi       = "10.18653/v1/2025.findings-emnlp.1272",
    pages     = "23428--23438",
    isbn      = "979-8-89176-335-7",
    abstract  = "Out-of-distribution (OOD) detection is a key safeguard for large language models, especially when they{'}re deployed in real-world applications. However, existing OOD methods often struggle with prompts that are deliberately obfuscated, context-dependent, or superficially benign{---}making it hard to distinguish between harmless queries and adversarial or dangerous ones. These methods typically assess prompts in isolation, missing important semantic cues from the model{'}s response. We introduce PROOD, prompt-response OOD detection, a framework that jointly analyzes LLM prompts *and their corresponding outputs* to improve semantic understanding. PROOD supports zero-shot multiclass detection using synthetic data generation and it offers a tunable probabilistic classification output. We validate PROOD on three challenging benchmarks{---}TrustLLM, OR-Bench, and AdvBench{---}where consistently outperforms prior OOD techniques, improving F1 scores by up to 6.3 points, from 0.871 to 0.934. Our results show that incorporating model responses enables more accurate, context-aware OOD detection in complex and adversarial prompt environments."
}
Markdown (Informal)
[PROOD: A Simple LLM Out-of-Distribution Guardrail Leveraging Response Semantics](https://aclanthology.org/2025.findings-emnlp.1272/) (Tint, Findings 2025)
ACL