% System-description paper for the Medical Visual Answer Localization (MVAL)
% task at MedGenVidQA 2026, from the BioNLP 2026 shared-task proceedings.
% NOTE(review): the url field points at the Anthology preview/ingest host;
% confirm the canonical URL before this entry is published.
@inproceedings{li-yang-2026-njust,
  author    = {Li, Jinglong and
               Yang, Yang},
  title     = {{NJUST-KMG} at {MedGenVidQA} 2026: Cascade Multi-modal Alignment with {Gaussian} Soft Priors for Medical Visual Answer Localization},
  editor    = {Gupta, Deepak and
               Demner-Fushman, Dina},
  booktitle = {Proceedings of the {BioNLP} 2026 (Shared Tasks)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-2.30/},
  pages     = {229--232},
  isbn      = {979-8-89176-435-4},
  abstract  = {This paper describes the system developed for the Medical Visual Answer Localization (MVAL) task at MedGenVidQA 2026. Accurately locating surgical or instructional steps in medical videos is inherently challenging due to audio-visual asynchrony and the visual homogeneity of surgical scenes. We propose a Cascade Multi-modal Alignment Framework that integrates Large Language Models (LLMs) to bridge the semantic-temporal gap. Our pipeline utilizes WhisperX for word-level speech transcription to ensure precise textual anchoring. We then employ Gemini3 as a high-level semantic ranker to generate multi-scale textual priors. Crucially, we transform these discrete semantic scores into a continuous 1D Gaussian Soft Prior, which is injected as an attention bias into our cross-modal fusion network. This mechanism preserves global temporal context while guiding the model to focus on query-relevant frames. Our system achieves highly competitive performance on the validation leaderboard, particularly under strict evaluation metrics, reaching an IoU@0.7 of 67.5{\%}.},
}
Markdown (Informal)
[NJUST-KMG at MedGenVidQA 2026: Cascade Multi-modal Alignment with Gaussian Soft Priors for Medical Visual Answer Localization](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-2.30/) (Li & Yang, BioNLP 2026)
ACL