@comment{
  Conference paper record (ACL 2026, Volume 1: Long Papers) for VAPO.
  NOTE(review): the url field points at an ACL Anthology preview/ingest build;
  such links are presumably temporary -- confirm and switch to the canonical
  Anthology address (and add a doi) once the volume is officially published.
  NOTE(review): address holds the conference venue, following the Anthology
  export convention, not the publisher's city.
}
@inproceedings{hu-etal-2026-vapo,
  title     = {{VAPO}: End-to-end Slide-Enhanced Speech Recognition with Omni-modal Large Language Models},
  author    = {Hu, Rui and
               Qiu, Delai and
               Wang, Yining and
               Liu, Shengping and
               Sang, Jitao},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.425/},
  pages     = {9417--9432},
  isbn      = {979-8-89176-390-6},
  abstract  = {Omni-modal large language models (OLLMs) offer a promising end-to-end solution for slide-enhanced speech recognition due to their inherent multimodal capabilities. However, we found a fundamental issue faced by OLLMs: \textit{Visual Interference}, where models show a bias towards visible text over auditory signals, causing them to hallucinate slide content that was never spoken. To address this, we propose Visually-Anchored Policy Optimization (VAPO), which aims to reshape models' inference process to follow the human-like ``Look-then-Listen'' inference chain. Specifically, we design a temporally decoupled policy: the model first extracts visual priors in a {\ensuremath{<}}think{\ensuremath{>}} block to serve as semantic anchors, then generates the transcription in an {\ensuremath{<}}answer{\ensuremath{>}} block. The policy is optimized via multi-objective reinforcement learning. Furthermore, we introduce SlideASR-Bench, a comprehensive benchmark designed to address the scarcity of entity-rich data, comprising a large-scale synthetic corpus for training and a challenging real-world test set for evaluation. We conduct extensive evaluations demonstrating that VAPO effectively eliminates visual interference and achieves state-of-the-art performance on SlideASR-Bench and public datasets, significantly reducing entity recognition errors in specialized domains.},
}
Markdown (Informal)
[VAPO: End-to-end Slide-Enhanced Speech Recognition with Omni-modal Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.acl-long.425/) (Hu et al., ACL 2026)
ACL