% UNCC system description paper for the MedGenVidQA 2026 shared task (BioNLP 2026 Shared Tasks volume).
@inproceedings{demirhan-zadrozny-2026-uncc,
    title = "{UNCC} at {MedGenVidQA} 2026: Structured Temporal Grounding for Medical Video Question Answering",
    author = "Demirhan, Hilmi and
      Zadrozny, Wlodek",
    editor = "Gupta, Deepak and
      Demner-Fushman, Dina",
    booktitle = "Proceedings of the {BioNLP} 2026 (Shared Tasks)",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-2.35/",
    pages = "262--269",
    isbn = "979-8-89176-435-4",
    abstract = "MedGenVidQA 2026 Task C evaluates visual answer localization in medical videos. The system receives a video and a question, then returns the start and end time of the visual answer. Our framework used timestamped automatic speech recognition (ASR) as a proposal source rather than as a final boundary label. The framework generated transcript tables, phase maps, lexical and dense candidate windows, schema-constrained ranking inputs, selective key-frame checks, and a deterministic validation pass for the final JSON file. The ranker selected among bounded candidate intervals instead of generating arbitrary timestamps over a full transcript. Each output can be traced to segment identifiers, candidate source families, selected anchors, phase labels, and validation flags. Our best run ranked fifth among six participant systems, with 62.50 IoU@0.3, 36.25 IoU@0.5, 22.50 IoU@0.7, and 42.57 mIoU. The threshold pattern suggests that coarse temporal retrieval was more reliable than strict start-end localization."
}
Markdown (Informal)
[UNCC at MedGenVidQA 2026: Structured Temporal Grounding for Medical Video Question Answering](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-2.35/) (Demirhan & Zadrozny, BioNLP 2026)
ACL