@inproceedings{xie-etal-2025-mafmo,
title = "{MAFMO}: Multi-modal Adaptive Fusion with Meta-template Optimization for Vision-Language Models",
author = "Xie, Mingrui and
Xu, Lulu and
Du, Junliang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.findings-emnlp.953/",
doi = "10.18653/v1/2025.findings-emnlp.953",
pages = "17576--17585",
ISBN = "979-8-89176-335-7",
abstract = "Vision-language models like CLIP demonstrate exceptional generalization capabilities but face significant adaptation challenges due to parameter scale, prompt sensitivity, and cross-modal alignment difficulties. Existing approaches primarily focus on single-modality adjustments, leading to suboptimal alignment and limited generalization. We introduce MAFMO, a plug-and-play framework comprising: (1) a Harmonic Cross-Modal Adapter enabling efficient cross-modal knowledge transfer; (2) a Meta-Template Optimization module dynamically generating input-dependent templates; and (3) a Cross-Modal Knowledge Synthesis mechanism preserving critical structural relationships during adaptation. Extensive experiments across multiple fine-grained visual recognition benchmarks demonstrate MAFMO consistently improves existing methods' performance on both novel classes and harmonic mean, while maintaining robustness under various challenging conditions with minimal computational overhead."
}