% IJCNLP-AACL 2025 proceedings paper (Zhang), pages 19--36.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% branch; confirm the canonical Anthology URL before relying on it.
@inproceedings{zhang-2025-roles,
  author    = {Zhang, Xiantao},
  title     = {Roles of {MLLMs} in Visually Rich Document Retrieval for {RAG}: A Survey},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  pages     = {19--36},
  isbn      = {979-8-89176-298-5},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.2/},
  abstract  = {Visually rich documents (VRDs) challenge retrieval-augmented generation (RAG) with layout-dependent semantics, brittle OCR, and evidence spread across complex figures and structured tables. This survey examines how Multimodal Large Language Models (MLLMs) are being used to make VRD retrieval practical for RAG. We organize the literature into three roles: *Modality-Unifying Captioners*, *Multimodal Embedders*, and *End-to-End Representers*. We compare these roles along retrieval granularity, information fidelity, latency and index size, and compatibility with reranking and grounding. We also outline key trade-offs and offer some practical guidance on when to favor each role. Finally, we identify promising directions for future research, including adaptive retrieval units, model size reduction, and the development of evaluation methods.},
}
Markdown (Informal)
[Roles of MLLMs in Visually Rich Document Retrieval for RAG: A Survey](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.2/) (Zhang, IJCNLP-AACL 2025)
ACL
- Xiantao Zhang. 2025. Roles of MLLMs in Visually Rich Document Retrieval for RAG: A Survey. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 19–36, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.