% ACL Anthology record for a Findings of ACL 2026 paper.
% The url field uses the canonical Anthology address instead of the temporary
% ingest-preview address, which is a staging link and not a stable permalink.
@inproceedings{chen-etal-2026-preserving,
  title     = {Preserving Language Capabilities in Vision-Language Models via Representation Regulation},
  author    = {Chen, ZiXuan and
               Tao, Juncheng and
               Zeng, Ziqian},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1210/},
  pages     = {24189--24205},
  isbn      = {979-8-89176-395-1},
  abstract  = {Vision-Language Models (VLMs) provide a unified framework to process both text-only tasks and vision-language tasks. However, finetuning VLMs on vision-language data has degraded language capabilities. In this paper, we prove that as the training loss declines during finetuning, the visual representation and textual representation move closer to each other, a phenomenon we term ``representation mixing.'' We prove that the representation mixing occurring within the post-representation layers causes the degradation of language capabilities. Post-representation layers refer to the first few layers in LLMs that are involved in representation learning. To preserve the language capabilities, we propose the Representation Regulation for VLM Training (RRVLM), which introduces a Representation Distribution Difference (RDD) loss to reduce the distance between these representations. Extensive experiments on various benchmarks and VLM frameworks show that our method can effectively preserve the language capabilities and achieve superior vision-language performance.},
}
Markdown (Informal)
[Preserving Language Capabilities in Vision-Language Models via Representation Regulation](https://aclanthology.org/2026.findings-acl.1210/) (Chen et al., Findings 2026)
ACL