% MAVIS paper, Findings of ACL 2026.
% The abstract is stored as plain text (no Markdown emphasis markers) so it
% typesets without stray asterisks; acronyms are brace-protected as whole words.
@inproceedings{zhang-etal-2026-mavis,
  author    = {Zhang, Jie and
               Ye, Qilang and
               Zhou, Hao and
               Liang, Haochen and
               Luo, Fei},
  title     = {{MAVIS}: Multi-Agent Video Retrieval via Structured Video Understanding},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {21751--21764},
  isbn      = {979-8-89176-395-1},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1094/},
  abstract  = {The dominant paradigm in video retrieval relies on embedding-based full-corpus scanning, which suffers from inherent computational inefficiency and the semantic asymmetry between information-dense videos and sparse textual queries. To bridge this gap, we introduce MAVIS, a novel multi-agent framework that rethinks retrieval as cooperative reasoning rather than brute-force search. MAVIS first bridges the granularity mismatch by parsing raw videos into a Structured Semantic Library, enabling explicit attribute-level indexing. During retrieval, a planner decomposes complex user intents into atomic sub-tasks, dispatching specialized agents to independently nominate candidates. Crucially, MAVIS employs a Logic-aware Debate mechanism with a strict veto protocol, where agents collaboratively prune logical mismatches to identify a compact set of ``controversial'' candidates for fine-grained verification. This agentic workflow effectively bypasses the inefficiency of full-library traversal. Extensive experiments on MSR-VTT, MSVD, and ActivityNet demonstrate that MAVIS achieves competitive performance without task-specific fine-tuning, offering a scalable and interpretable alternative to traditional dual-encoder approaches.},
}

Markdown (Informal)
[MAVIS: Multi-Agent Video Retrieval via Structured Video Understanding](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1094/) (Zhang et al., Findings 2026)
ACL