% NAACL 2025 long paper; ACL Anthology ID 2025.naacl-long.353 (see the url field).
@inproceedings{vasisht-etal-2025-knowledge,
  title     = {Knowledge Graph Guided Evaluation of Abstention Techniques},
  author    = {Vasisht, Kinshuk and
               Kaur, Navreet and
               Pruthi, Danish},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.naacl-long.353/},
  pages     = {6921--6939},
  isbn      = {979-8-89176-189-6},
  abstract  = {To deploy language models safely, it is crucial that they abstain from responding to inappropriate requests. Several prior studies test the safety promises of models based on their effectiveness in blocking malicious requests. In this work, we focus on evaluating the underlying techniques that cause models to abstain. We create {\textquoteleft}SELECT{\textquoteright}, a benchmark derived from a set of benign concepts (e.g., {\textquotedblleft}rivers{\textquotedblright}) from a knowledge graph. Focusing on benign concepts isolates the effect of safety training, and grounding these concepts in a knowledge graph allows us to study the *generalization* and *specificity* of abstention techniques. Using {\textquoteleft}SELECT{\textquoteright}, we benchmark different abstention techniques over six open-weight and closed-source models. We find that the examined techniques indeed cause models to abstain with over 80{\%} abstention rates. However, these techniques are not as effective for descendants of the target concepts, where abstention rates drop by 19{\%}. We also characterize the generalization-specificity trade-offs for different techniques. Overall, no single technique is invariably better than others, and our findings inform practitioners of the various trade-offs involved.},
}
Markdown (Informal)
[Knowledge Graph Guided Evaluation of Abstention Techniques](https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.naacl-long.353/) (Vasisht et al., NAACL 2025)
ACL
- Kinshuk Vasisht, Navreet Kaur, and Danish Pruthi. 2025. Knowledge Graph Guided Evaluation of Abstention Techniques. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 6921–6939, Albuquerque, New Mexico. Association for Computational Linguistics.