@inproceedings{islam-rochan-2025-adaptmerge,
title = "{A}dapt{M}erge: Inference Time Adaptive Visual and Language-Guided Token Merging for Efficient Large Multimodal Models",
author = "Islam, Zahidul and
Rochan, Mrigank",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.387/",
doi = "10.18653/v1/2025.findings-emnlp.387",
pages = "7352--7361",
ISBN = "979-8-89176-335-7",
abstract = "Recent advances in Large Multimodal Models (LMMs) have showcased impressive visual understanding and vision-language reasoning capabilities, yet their computational cost hinders practical deployment, especially in resource-constrained settings. A key bottleneck is the large number of visual tokens generated by its vision encoders, which increases latency and memory demands. Existing token reduction methods often require costly fine-tuning or apply fixed token reduction ratios, ignoring image complexity and vision-language interactions. We propose AdaptMerge, a training-free, inference-time token merging strategy that adaptively reduces visual tokens by leveraging feature diversity and language-guided relevance. By dynamically adjusting to image complexity and ensuring multimodal coherence, AdaptMerge significantly lowers floating-point operations while improving performance. Extensive experiments on Google{'}s latest Gemma 3 models (4B and 12B parameters) across four challenging benchmarks demonstrate that AdaptMerge outperforms state-of-the-art token reduction techniques, achieving both reduced computational costs and improved performance, thereby providing a practical pathway to more efficient LMMs."
}