@inproceedings{chen-etal-2025-exploiting,
title = "Exploiting Prompt-induced Confidence for Black-Box Attacks on {LLM}s",
author = "Chen, Meina and
Tang, Yihong and
Chen, Kehai",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.692/",
doi = "10.18653/v1/2025.findings-emnlp.692",
pages = "12897--12903",
ISBN = "979-8-89176-335-7",
abstract = "Large language models (LLMs) are vulnerable to adversarial attacks even in strict black-box settings with only hard-label feedback.Existing attacks suffer from inefficient search due to lack of informative signals such as logits or probabilities. In this work, we propose Prompt-Guided Ensemble Attack (PGEA), a novel black-box framework that leverages prompt-induced confidence, which reflects variations in a model{'}s self-assessed certainty across different prompt templates, as an auxiliary signal to guide attacks. We first demonstrate that confidence estimates vary significantly with prompt phrasing despite unchanged predictions. We then integrate these confidence signals in a two-stage attack: (1) estimating token-level vulnerability via confidence elicitation, and (2) applying ensemble word-level substitutions guided by these estimates. Experiments on LLaMA-3-8B-Instruct and Mistral-7B-Instruct-v0.3 on three classification tasks show that PGEA improves the attack success rate and query efficiency while maintaining semantic fidelity. Our results highlight that verbalized confidence, even without access to probabilities, is a valuable and underexplored signal for black-box adversarial attacks. The code is available at https://github.com/cmn-bits/PGEA-main."
}
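
For readers skimming the abstract, a minimal sketch of the prompt-induced confidence signal it describes might look like the following. This is an illustration under assumptions, not the authors' released implementation (see the linked GitHub repository for that): `query_model` is a hypothetical stub for any hard-label black-box LLM interface, and the prompt templates are invented examples of the rephrasing the paper exploits.

```python
import re
import statistics
from typing import Callable

# Hypothetical stub: wire this to any black-box LLM endpoint that
# returns plain text (hard-label setting; no logits required).
def query_model(prompt: str) -> str:
    raise NotImplementedError("connect your LLM API here")

# Invented example templates; the paper's observation is that verbalized
# confidence varies with phrasing even when the predicted label does not.
TEMPLATES = [
    "Classify the sentiment of: '{text}'. Answer with a label and a confidence from 0 to 100.",
    "How sure are you (0-100) about the sentiment of: '{text}'? Give a label and a confidence.",
    "Sentiment of '{text}'? State your label, then your certainty as a percentage.",
]

def elicit_confidences(text: str,
                       query: Callable[[str], str] = query_model) -> list[float]:
    """Ask the model the same question under different phrasings and
    parse the verbalized confidence from each reply."""
    scores = []
    for template in TEMPLATES:
        reply = query(template.format(text=text))
        match = re.search(r"(\d{1,3})", reply)
        if match:
            scores.append(float(match.group(1)))
    return scores

def vulnerability_score(text: str) -> float:
    """Higher variance across templates means less stable self-assessed
    certainty, which a PGEA-style attack treats as a weak point."""
    scores = elicit_confidences(text)
    return statistics.pvariance(scores) if len(scores) > 1 else 0.0
```

In the paper's two-stage framing, stage (1) would apply this kind of elicitation at the token level to rank candidate positions, and stage (2) would then guide ensemble word-level substitutions toward the high-variance positions; the exact ranking and substitution procedures are specified in the paper and repository, not here.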