% Findings of ACL 2026 paper by Huang, Zhang and Wang.
% {An} is braced in the title because sentence-casing styles keep a capital
% only after a colon, not after a question mark; without the braces the
% subtitle would render as "... knowledge? an unknown ...".
% NOTE(review): the url field points at a preview.aclanthology.org path,
% which looks like a staging address -- confirm, and replace it with the
% canonical Anthology URL once one is available.
@inproceedings{huang-etal-2026-llms,
  title     = {How Do {LLMs} ``Trust'' Unknown Knowledge? {An} Unknown Knowledge Based Jailbreak Attack},
  author    = {Huang, Yixiao and
               Zhang, Lan and
               Wang, Chaoran},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingestion-form-platform/2026.findings-acl.1849/},
  pages     = {37105--37124},
  isbn      = {979-8-89176-395-1},
  abstract  = {Learning unknown knowledge through ICL and RAG can enhance LLM capabilities in specialized fields. While most research focuses on how to identify and utilize such knowledge, little work examines what factors lead LLMs to trust and adopt it, leaving models prone to errors and harmful content. Grounded in extensive pre-experiments, we design five pairs of trust-enhancing and trust-diminishing transformations on unknown knowledge to experimentally identify the key trust factors. These findings are further substantiated through a detailed theoretical analysis grounded in the epistemological framework of evidentialism. Based on these insights, we challengingly propose a completely unrestricted and fully randomized jailbreak attack that embeds malicious queries within trust-enhanced unknown knowledge. In both defended and undefended scenarios, our method achieves 99\% to 100\% ASR on all tested LLMs, including the latest GPT-5.1, and becomes SOTA. This attack confirms the trust mechanism and exposes a critical and hard-to-defend security risk. Our conclusions provide valuable guidance for understanding trust mechanism of unknown knowledge and for future research.},
}
Markdown (Informal)
[How Do LLMs "Trust" Unknown Knowledge? An Unknown Knowledge Based Jailbreak Attack](https://preview.aclanthology.org/ingestion-form-platform/2026.findings-acl.1849/) (Huang et al., Findings 2026)
ACL