@inproceedings{gloria-silva-etal-2026-vigia,
    title = "{VIGiA}: Instructional Video Guidance via Dialogue Reasoning and Retrieval",
    author = "Gl{\'o}ria-Silva, Diogo and
      Semedo, David and
      Magalh{\~a}es, Jo{\~a}o",
    editor = "Demberg, Vera and
      Inui, Kentaro and
      Marquez, Llu{\'i}s",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-eacl.208/",
    pages = "4003--4030",
    isbn = "979-8-89176-386-9",
    abstract = "We introduce VIGiA, a novel multimodal dialogue model designed to understand and reason over complex, multi-step instructional video action plans. Unlike prior work which focuses mainly on text-only guidance, or treats vision and language in isolation, VIGiA supports grounded, plan-aware dialogue that requires reasoning over visual inputs, instructional plans, and interleaved user interactions. To this end, VIGiA incorporates two key capabilities: (1) multimodal plan reasoning, enabling the model to align uni- and multimodal queries with the current task plan and respond accurately; and (2) plan-based retrieval, allowing it to retrieve relevant plan steps in either textual or visual representations. Experiments were done on a novel dataset with rich Instructional Video Dialogues aligned with Cooking and DIY plans. Our evaluation shows that VIGiA outperforms existing state-of-the-art models on all tasks in a conversational plan guidance setting, reaching over 90{\%} accuracy on plan-aware VQA."
}
Markdown (Informal)
[VIGiA: Instructional Video Guidance via Dialogue Reasoning and Retrieval](https://aclanthology.org/2026.findings-eacl.208/) (Glória-Silva et al., Findings 2026)
ACL