@inproceedings{zhao-etal-2026-learning,
  title     = {Learning Flexible Large Multimodal Models with Arbitrary Modality Combinations},
  author    = {Zhao, Xinyu and
               Ni, Kangqi and
               Peng, Jie and
               Li, Ang and
               Chen, Tianlong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.517/},
  pages     = {10664--10678},
  isbn      = {979-8-89176-395-1},
  abstract  = {Multimodal Large Language Models (MLLMs) show strong potential for cross-modal understanding by integrating powerful language models with multimodal encoders. However, extending MLLMs to handle a diverse range of modalities introduces two critical and intertwined challenges: (1) the reliance on fully paired multimodal data, often scarce or costly to acquire across all modalities, and (2) the computational inefficiency from processing numerous modality tokens and requiring substantial model updates for each new modality. To address these challenges, we enable MLLMs to handle missing modalities by generating representations for absent inputs. Furthermore, recognizing that an increasing number of modalities leads to linearly scaling token counts and that lengthy generated sequences can hinder performance, we employ a dual-stage compression mechanism. It first reduces the number of tokens per modality and then condenses information from multiple modalities into a single, compact token sequence. This culminates in Flex-M$^3$, a novel MLLM framework designed for flexible and efficient learning across arbitrary combinations of modalities. Experiments across diverse multimodal benchmarks and backbones demonstrate that Flex-M$^3$ robustly handles varied modality inputs and scales efficiently. Notably, Flex-M$^3$ outperforms its counterpart trained on only full-modality data, with consistent improvements of {2.29{\%}, 3.15{\%}, 11.01{\%}} on multimodal reasoning tasks {NExT-QA, MUSIC-AVQA, SQA3D}. Moreover, Flex-M$^3$ demonstrates superior robustness during inference, even when a high proportion of modalities are missing from the input samples, showcasing its capacity for complex, data-scarce multimodal applications.},
}
Markdown (Informal)
[Learning Flexible Large Multimodal Models with Arbitrary Modality Combinations](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.517/) (Zhao et al., Findings 2026)
ACL