@inproceedings{wu-etal-2025-unveiling-multimodal,
title = "Unveiling Multimodal Processing: Exploring Activation Patterns in Multimodal {LLM}s for Interpretability and Efficiency",
author = "Wu, Chuan and
Su, Meng and
Fang, Youxuan and
Zhu, Shaolin",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.478/",
doi = "10.18653/v1/2025.findings-emnlp.478",
pages = "9005--9016",
ISBN = "979-8-89176-335-7",
abstract = "Recent Multimodal Large Language Models (MLLMs) have achieved remarkable advancements, yet their internal mechanisms for concurrently processing diverse modalities like text, image, and audio remain largely opaque. In this paper, we propose a methodology to convert dense MLLMs into fine-grained Mixture-of-Experts (MoE) architectures. This allows us to visually investigate their multimodal activation patterns through expert activation frequency heatmaps. Conducting comprehensive experiments on representative MLLMs, we analyze the similarities and differences in internal neuron activations when handling distinct modalities. Specifically, we examine the distribution of high-frequency activated experts, the distinct roles of high-frequency (e.g., fundamental logic) and low-frequency (e.g., domain-specific concepts) multimodal shared experts, and the prevalence and localization of modality-specific experts. Furthermore, we explore leveraging these discovered activation discrepancies to guide sparse activation and model pruning. Experimental results demonstrate that our approach substantially outperforms random expert pruning and can achieve comparable or even superior performance to the original unpruned models while utilizing significantly fewer active parameters. Our work not only sheds light on the multimodal processing mechanisms within MLLMs but also provides a practical pathway toward developing more interpretable and efficient multimodal systems."
}
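As a rough illustration of the activation-frequency-guided expert pruning the abstract describes, here is a minimal Python sketch. This is not the authors' implementation: the function name, the keep_ratio parameter, and the toy counts are all hypothetical. It assumes per-expert activation counts have already been collected on a calibration set and simply retains the most frequently activated experts:

    # Hypothetical sketch of frequency-guided expert pruning (not the
    # authors' code): rank fine-grained MoE experts by how often they
    # fired on a calibration set, and keep only the top fraction.
    import numpy as np

    def prune_experts_by_frequency(activation_counts, keep_ratio=0.5):
        """Return sorted indices of experts to keep.

        activation_counts: 1-D array, activation count per expert on a
                           calibration set.
        keep_ratio:        assumed parameter, fraction of experts kept.
        """
        num_keep = max(1, int(len(activation_counts) * keep_ratio))
        # High-frequency experts are assumed to carry shared, fundamental
        # functionality (per the abstract) and are kept first.
        order = np.argsort(activation_counts)[::-1]
        return np.sort(order[:num_keep])

    # Toy example: 8 experts with made-up calibration counts.
    counts = np.array([120, 3, 85, 40, 0, 97, 12, 66])
    print(prune_experts_by_frequency(counts, keep_ratio=0.5))  # [0 2 5 7]

The paper additionally distinguishes modality-specific from shared experts, so a faithful reimplementation would track per-modality frequencies rather than the single global count used in this sketch.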