% Conference-paper record: "Citation Drift" (Ram, 2025), pages 186--191 of the
% workshop proceedings named in booktitle.
% NOTE(review): url is on a preview.aclanthology.org host, which looks like a
%   staging address -- confirm the permanent URL before relying on it.
% NOTE(review): address holds the event location ("Mumbai, India and virtual"),
%   not a publisher city; value kept exactly as exported.
@inproceedings{ram-2025-citation,
  title     = {Citation Drift: Measuring Reference Stability in Multi-Turn {LLM} Conversations},
  author    = {Ram, Gokul Srinath Seetha},
  editor    = {Accomazzi, Alberto and
               Ghosal, Tirthankar and
               Grezes, Felix and
               Lockhart, Kelly},
  booktitle = {Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India and virtual},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.wasp-main.20/},
  pages     = {186--191},
  isbn      = {979-8-89176-310-4},
  abstract  = {Large Language Models (LLMs) are increasingly used for scientific writing and research assistance, yet their ability to maintain consistent citations across multi-turn conversations remains unexplored. This paper introduces the concept of citation drift{---}the phenomenon where references mutate, disappear, or get fabricated during extended LLM interactions. We analyze 240 conversations across four LLaMA models using 36 authentic scientific papers from six domains and find significant citation instability. LLaMA-4-Maverick-17B achieves the highest stability (0.481) and lowest fabrication entropy, while LLaMA-4-Scout-17B fabricates up to 85.6{\%} of citations. We introduce five new metrics{---}stability, fabrication rate, drift rate, drift entropy, and willingness-to-cite{---}providing a standardized framework for evaluating factual reliability in scientific dialogue systems. Our benchmark offers reproducible, model-agnostic evaluation tools for assessing citation reliability in AI-assisted research workflows.},
}

% The text below is citation-export residue, not BibTeX; it sits outside any entry.
Markdown (Informal)
[Citation Drift: Measuring Reference Stability in Multi-Turn LLM Conversations](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.wasp-main.20/) (Ram, WASP 2025)
ACL