@inproceedings{lim-etal-2026-vg,
    title     = "{VG-CoT}: Towards Trustworthy Visual Reasoning via Grounded Chain-of-Thought",
    author    = "Lim, Byeonggeuk and
                 Kim, Kyeonghyun and
                 Yun, Jungmin and
                 Kim, Youngbin",
    editor    = "Piperidis, Stelios and
                 Bel, N{\'u}ria and
                 van den Heuvel, Henk and
                 Ide, Nancy and
                 Krek, Simon and
                 Toral, Antonio",
    booktitle = "International Conference on Language Resources and Evaluation",
    volume    = "main",
    month     = may,
    year      = "2026",
    address   = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url       = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.867/",
    pages     = "11108--11118",
    abstract  = "The advancement of Large Vision-Language Models (LVLMs) requires precise local region-based reasoning that faithfully grounds the model{'}s logic in actual visual evidence. However, existing datasets face limitations in scalability due to extensive manual annotation and lack explicit alignment between multi-step reasoning and corresponding image regions, which constrains the evaluation of model trustworthiness. To address these challenges, we propose the Visual Grounding Chain-of-Thought (VG-CoT) dataset, which explicitly links each reasoning step to real visual evidence within the image through a fully automated three-stage pipeline. The pipeline first extracts object- and text-level visual evidence using state-of-the-art detection and OCR models, then generates step-by-step grounded reasoning with GPT-4o, and finally refines the grounding through a rationale-driven open-set detection process. In addition, we introduce a new benchmark that comprehensively evaluates LVLMs reasoning across three complementary dimensions: Rationale Quality, Answer Accuracy, and Reasoning{--}Answer Alignment. Experiments with representative LVLMs, including LLaVA-1.5 and Qwen2-VL, demonstrate consistent improvements across all evaluation metrics, confirming that VG-CoT effectively enhances trustworthy, evidence-based reasoning while maintaining scalable and cost-efficient dataset construction. The dataset and code will be released publicly upon acceptance to facilitate further research.",
}

Markdown (Informal)
[VG-CoT: Towards Trustworthy Visual Reasoning via Grounded Chain-of-Thought](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.867/) (Lim et al., LREC 2026)
ACL