% Workshop paper record (ALVR 2026 proceedings, ACL Anthology export format).
% - "Mapping" in the title is brace-protected: sentence-casing styles keep a
%   capital only after a colon, so the word after "? " would be lowercased.
% - NOTE(review): url points at the preview.aclanthology.org ingest host;
%   presumably it should become the canonical aclanthology.org URL once the
%   volume is published -- confirm before release.
% - NOTE(review): address looks like the event location rather than the
%   publisher's city (Anthology export convention) -- confirm it suits the
%   target bibliography style.
@inproceedings{hasan-2026-fragile,
  title     = {How Fragile Is Vision-Language Alignment? {Mapping} Concept Disruption Under Text-to-Image Personalization},
  author    = {Hasan, Mujtaba},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.27/},
  pages     = {278--286},
  isbn      = {979-8-89176-398-2},
  abstract  = {Text-to-image diffusion models learn a mapping from natural language to visual structure, but how robust is this mapping to perturbation? We use personalization{---}fine-tuning a model to learn a new face, object, or style{---}as a controlled stress test to probe the fragility of learned vision-language alignment. We find that fine-tuning for one concept systematically shifts the model{'}s ability to faithfully render unrelated concepts, and that this disruption follows structured, predictable patterns. To measure this fragility, we construct Concept Entanglement Maps: per-prompt, per-model disruption matrices that reveal which concepts are most affected and why. Using Stable Diffusion v1.5 as a controlled testbed, we evaluate 15 subjects across three personalization methods on 200 prompts and report three findings about the organization of vision-language alignment: (1) aggregate disruption is larger for vision-backbone and cross-attention perturbations than for text-embedding perturbations, despite the latter directly modifying the language representation; (2) abstract and compositional language is significantly more fragile than concrete, object-specific language; and (3) disruption does not follow semantic proximity{---}personalizing for a face does not preferentially disrupt other face-related prompts ($p = 1.0$), suggesting that alignment vulnerability is organized globally rather than purely by semantic category. These findings expose a structural vulnerability in current text-to-image personalization: the same cross-attention mechanism that enables compositional generalization also creates pathways through which local fine-tuning can propagate as global alignment shift.},
}
Markdown (Informal)
[How Fragile Is Vision-Language Alignment? Mapping Concept Disruption Under Text-to-Image Personalization](https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.27/) (Hasan, ALVR 2026)
ACL