% Workshop paper in the TrustNLP 2026 proceedings (ACL Anthology style record).
% NOTE(review): `url` points at an Anthology preview/staging path
% (preview.aclanthology.org/ingest-...); presumably it should be swapped for
% the canonical aclanthology.org URL once the volume is published -- confirm.
@inproceedings{garani-2026-systematic,
  title     = {A Systematic Taxonomy of Failure Modes in Retrieval-Augmented Generation Systems},
  author    = {Garani, Anupama},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galstyan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.27/},
  pages     = {413--424},
  isbn      = {979-8-89176-418-7},
  abstract  = {Retrieval-Augmented Generation (RAG) systems fail in diverse, poorly characterized ways that single-stage evaluation metrics cannot detect. We present a systematic taxonomy of 33 failure modes across 7 pipeline stages {---} ingestion, representation, retrieval, generation, evaluation, deployment, and agentic orchestration {---} constructed through a structured literature review of 48 sources spanning peer-reviewed publications and high-impact preprints. For each mode, we provide a formal definition, observable manifestation, and three-level evidence grading (Strong/Moderate/Limited). Our analysis reveals a critical asymmetry in research attention: retrieval and generation failures are comparatively well-studied, while representation, evaluation, and agentic orchestration failures remain under-investigated despite frequent occurrence in production. We identify 12 failure modes with no dedicated peer-reviewed empirical evidence {---} all 8 agentic modes among them {---} constituting an evidence desert in the fastest-growing RAG deployment paradigm. Compared to prior work enumerating 7 failure points (Barnett et al., 2024) or 16 error types within partial pipeline runs (Cresswell et al., 2025), our taxonomy uniquely spans the full pipeline, including agentic orchestration with explicit evidence-level grading.},
}