@inproceedings{parvez-2025-chain,
title = "Chain of Evidences and Evidence to Generate: Prompting for Context Grounded and Retrieval Augmented Reasoning",
author = "Parvez, Md Rizwan",
editor = "Shi, Weijia and
Yu, Wenhao and
Asai, Akari and
Jiang, Meng and
Durrett, Greg and
Hajishirzi, Hannaneh and
Zettlemoyer, Luke",
booktitle = "Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.knowledgenlp-1.21/",
pages = "230--245",
ISBN = "979-8-89176-229-9",
abstract = "While chain-of-thoughts (CoT) prompting has revolutionized how LLMs perform reasoning tasks, its current methods and variations (e.g, Self-consistency, ReACT, Reflexion, Tree-of-Thoughts (ToT), Cumulative Reasoning (CR) etc.,) suffer from limitations like limited context grounding, hallucination/inconsistent output generation, and iterative sluggishness. To overcome these challenges, we introduce a novel mono/dual-step zero-shot prompting framework built upon two unique strategies \textbf{Chain of Evidences (CoE)} and \textbf{Evidence to Generate (E2G)}. Instead of unverified reasoning claims, our innovative approaches leverage the power of ``evidence for decision making'' by first focusing exclusively on the thought sequences explicitly mentioned in the context which then serve as extracted evidence, guiding the LLM{'}s output generation process with greater precision and efficiency. This simple yet potent approach unlocks the full potential of chain-of-thoughts prompting, facilitating faster, more reliable, and contextually aware reasoning in LLMs. Our framework consistently achieves remarkable results across various knowledge-intensive reasoning and generation tasks, surpassing baseline approaches with state-of-the-art LLMs. For instance, (i) on the LogiQA benchmark using GPT-4, CoE achieves a new state-of-the-art accuracy of 53.8{\%}, surpassing CoT by 18{\%}, ToT by 11{\%}, and CR by 9{\%}; (ii) CoE with PaLM-2 outperforms the variable-shot performance of Gemini Ultra by 0.9 F1 points, achieving an F1 score of 83.3 on DROP. We release our prompts and outputs on these benchmarks as a new instruction tuning dataset for future research at \textit{Hugging Face}."
}