@inproceedings{xing-etal-2025-benchmarking,
title = "Benchmarking and Improving {LVLM}s on Event Extraction from Multimedia Documents",
author = "Xing, Fuyu and
Wang, Zimu and
Wang, Wei and
Zhang, Haiyang",
editor = "Flek, Lucie and
Narayan, Shashi and
Phương, Lê Hồng and
Pei, Jiahuan",
booktitle = "Proceedings of the 18th International Natural Language Generation Conference",
month = oct,
year = "2025",
address = "Hanoi, Vietnam",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-lei-gao-usc/2025.inlg-main.42/",
pages = "734--742",
abstract = "The proliferation of multimedia content necessitates the development of effective Multimedia Event Extraction (M{\texttwosuperior}E{\texttwosuperior}) systems. Though Large Vision-Language Models (LVLMs) have shown strong cross-modal capabilities, their utility in the M{\texttwosuperior}E{\texttwosuperior} task remains underexplored. In this paper, we present the first systematic evaluation of representative LVLMs, including DeepSeek-VL2 and the Qwen-VL series, on the M{\texttwosuperior}E{\texttwosuperior} dataset. Our evaluations cover text-only, image-only, and cross-media subtasks, assessed under both few-shot prompting and fine-tuning settings. Our key findings highlight the following valuable insights: (1) Few-shot LVLMs perform notably better on visual tasks but struggle significantly with textual tasks; (2) Fine-tuning LVLMs with LoRA substantially enhances model performance; and (3) LVLMs exhibit strong synergy when combining modalities, achieving superior performance in cross-modal settings. We further provide a detailed error analysis to reveal persistent challenges in areas such as semantic precision, localization, and cross-modal grounding, which remain critical obstacles for advancing M{\texttwosuperior}E{\texttwosuperior} capabilities."
}