% Workshop paper record; publisher and URL indicate an ACL Anthology export.
% NOTE(review): the url field points at a preview/ingest host
% (preview.aclanthology.org/ingest-acl-workshops); presumably a canonical
% Anthology URL or DOI will exist after publication -- confirm and update.
% NOTE(review): address holds "San Diego, CA"; presumably this is the event
% location rather than the publisher's city -- verify against the target style.
@inproceedings{ponnuraj-2026-document,
  title     = {Document Overlap Is Not Evidence Continuity: Measuring Retrieval Jitter in Citation-Based {RAG} Evaluation},
  author    = {Ponnuraj, Punitha},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({EvalEval})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.35/},
  pages     = {221--226},
  isbn      = {979-8-89176-429-3},
  abstract  = {RAG evaluations often rely on citations or retrieved evidence traces for correctness checks, provenance claims, and audits, implicitly assuming that evidence remains reproducible under routine retrieval settings. We test this assumption in a controlled diagnostic study where queries, embeddings, and decoding are fixed while retrieval depth, chunk size, and overlap vary. We call the resulting change in attributed evidence retrieval jitter and measure evidence identity at two levels: document (doc{\_}id) and exact cited span (doc{\_}id, span{\_}hash). Across BEIR ArguAna and SciFact, we observe a consistent Stability Gap: document overlap remains moderate while span overlap often collapses, including many cases of total span turnover despite non-empty retrieval. We interpret span-level instability as a diagnostic of exact evidence-trace reproducibility, not semantic equivalence. These findings motivate reporting stability diagnostics alongside citation-based evaluation metrics for more reproducible evaluation practice.},
}
Markdown (Informal)
[Document Overlap Is Not Evidence Continuity: Measuring Retrieval Jitter in Citation-Based RAG Evaluation](https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.35/) (Ponnuraj, EvalEval 2026)
ACL