% ACL 2026 long paper (Anthology ID 2026.acl-long.490).
% NOTE(review): url now points at the canonical aclanthology.org address built
% from the Anthology ID instead of the temporary preview.aclanthology.org
% staging host; confirm it resolves once the volume is published.
@inproceedings{yang-etal-2026-visual,
  title     = {Visual Self-Fulfilling Alignment: Shaping Safety-Oriented Personas via Threat-Related Images},
  author    = {Yang, Qishun and
               Yang, Shu and
               Hu, Lijie and
               Wang, Di},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.490/},
  pages     = {10698--10718},
  isbn      = {979-8-89176-390-6},
  abstract  = {Multimodal large language models (MLLMs) face safety misalignment where visual inputs enable harmful outputs. Existing methods require explicit safety labels or contrastive data, yet threat-related concepts are concrete and visually depictable, while safety concepts like helpfulness are abstract and lack visual referents. Inspired by self-fulfilling mechanism underlying emergent misalignment, we propose Visual Self-Fulfilling Alignment (VSFA). VSFA fine-tunes vision-language models (VLMs) on neutral VQA tasks constructed around threat-related images, without any safety labels. Through repeated exposure to threat-related visual content, models internalize implicit semantics of vigilance and caution, shaping safety-oriented personas. Experiments across multiple VLMs and safety benchmarks demonstrate that VSFA reduces attack success rate, improves response quality, and mitigates over-refusal while preserving general capabilities. Our work extends self-fulfilling mechanism from text to visual modalities, offering a label-free approach to VLMs alignment.},
}
Markdown (Informal)
[Visual Self-Fulfilling Alignment: Shaping Safety-Oriented Personas via Threat-Related Images](https://preview.aclanthology.org/ingest-acl/2026.acl-long.490/) (Yang et al., ACL 2026)
ACL