% Survey paper in the Findings of ACL 2026 volume
% (ACL Anthology ID 2026.findings-acl.1853).
% The url field uses the canonical Anthology address built from that ID,
% not the temporary preview/ingest build this record was exported from.
% NOTE(review): address holds the conference location as exported by the
% Anthology; confirm whether the target style expects the publisher's city.
@inproceedings{ma-etal-2026-towards,
  author    = {Ma, Xu and Zhang, Yitian and Fu, Yun},
  title     = {Towards Unified Multimodal Large Language Models: A survey},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {37212--37230},
  isbn      = {979-8-89176-395-1},
  url       = {https://aclanthology.org/2026.findings-acl.1853/},
  abstract  = {The recent surge of interest in unified Multimodal Large Language Models (MLLMs) has catalyzed rapid progress toward general-purpose generation and understanding across different modalities. Despite the remarkable advancements, the field lacks a systematic and cohesive framework that connects these developments, revisits the motivations, and situates current trends within a broader landscape. In this survey, we present a comprehensive and in-depth review of unified MLLMs, offering both a methodology taxonomy and unique perspectives on the field. We begin by outlining the foundational concepts and prerequisites for understanding unified MLLMs. We then delve into designs from different aspects, including model architectures, loss functions, alignment techniques, and different representation strategies. Furthermore, we discuss persistent challenges and identify promising directions for future research. By bridging scattered progress and providing a consolidated view, this survey aims to foster a deeper and systematical understanding of unified MLLMs and inspire future innovations in building truly general multimodal intelligence.},
}

Markdown (Informal)
[Towards Unified Multimodal Large Language Models: A survey](https://aclanthology.org/2026.findings-acl.1853/) (Ma et al., Findings 2026)
ACL