@inproceedings{zhang-etal-2025-vision,
title = "Vision-aided Unsupervised Constituency Parsing with Multi-{MLLM} Debating",
author = "Zhang, Dong and
Tian, Haiyan and
Sun, Qingying and
Li, Shoushan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/transition-to-people-yaml/2025.findings-acl.353/",
doi = "10.18653/v1/2025.findings-acl.353",
pages = "6800--6810",
ISBN = "979-8-89176-256-5",
abstract = "This paper presents a novel framework for vision-aided unsupervised constituency parsing (VUCP), leveraging multimodal large language models (MLLMs) pre-trained on diverse image-text or video-text data. Unlike previous methods requiring explicit cross-modal alignment, our approach eliminates this need by using pre-trained models like Qwen-VL and VideoLLaVA, which seamlessly handle multimodal inputs. We introduce two multi-agent debating mechanisms{---}consensus-driven (CD) and round-driven (RD){---}to enable cooperation between models with complementary strengths. Extensive experiments demonstrate that our approach achieves state-of-the-art performance on both image-text and video-text datasets for VUCP, improving robustness and accuracy."
}