@comment{NOTE(review): the url field below points at a preview.aclanthology.org ingest path; confirm whether a permanent aclanthology.org URL or a DOI should replace it before citing.}

@inproceedings{raihan-chowdhury-2026-causal,
  author    = {Raihan, Abrar Zahin and
               Chowdhury, Aurchi},
  title     = {Causal Localization of the {English} Pivot in {LLaVA}: Mechanistic {VLM} Analysis and Training-Free Multilingual Steering},
  editor    = {Huang, Kaiyu and
               Mo, Fengran and
               Chen, Pinzhen and
               Jiang, Meng},
  booktitle = {Proceedings of the 1st Workshop on Multilinguality in the Era of Large Language Models ({MeLLM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {257--265},
  isbn      = {979-8-89176-430-9},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.mellm-1.25/},
  abstract  = {Multilingual vision-language models (VLMs) consistently underperform on non-English visual queries, yet the internal mechanism behind this disparity remains unknown. As a focused case study on LLaVA-1.5-7B, we apply logit-lens analysis and causal activation patching to show that non-English visual queries are routed through an English-biased representational bottleneck in layers 5{--}17, extending the English-pivot phenomenon of Wendler et al. (2024) to the multimodal setting. Peak causal influence occurs at layer 8 ($\overline{\text{AIE}} = 0.49$, averaged across languages), with all measurable pivot signal running through text-token positions. Without meaningful visual content (blank-image condition), language-specific representations do not emerge at any layer, showing that the pivot is image-content-dependent rather than triggered by any visual input. Building on these findings, we derive training-free language-steering vectors at the mechanistically identified pivot layers, improving Russian VQA by +6.5 pp and Portuguese by +4.0 pp on MMMB without any fine-tuning {---} the latter surpassing the English baseline. Within this case study, our results are consistent with the English pivot being a structural property of the LLM backbone that multimodal pre-training does not mitigate; extending this mechanistic methodology to other VLMs and language families remains an important direction for future work.},
}
Markdown (Informal)
[Causal Localization of the English Pivot in LLaVA: Mechanistic VLM Analysis and Training-Free Multilingual Steering](https://preview.aclanthology.org/ingest-acl-workshops/2026.mellm-1.25/) (Raihan & Chowdhury, MeLLM 2026)
ACL