@inproceedings{qi-etal-2025-evaluating,
  title     = {Evaluating {LLMs}' Assessment of Mixed-Context Hallucination Through the Lens of Summarization},
  author    = {Qi, Siya and
               Cao, Rui and
               He, Yulan and
               Yuan, Zheng},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {16480--16503},
  doi       = {10.18653/v1/2025.findings-acl.847},
  url       = {https://aclanthology.org/2025.findings-acl.847/},
  isbn      = {979-8-89176-256-5},
  abstract  = {With the rapid development of large language models (LLMs), LLM-as-a-judge has emerged as a widely adopted approach for text quality evaluation, including hallucination evaluation. While previous studies have focused exclusively on single-context evaluation (e.g., discourse faithfulness or world factuality), real-world hallucinations typically involve mixed contexts, which remains inadequately evaluated. In this study, we use summarization as a representative task to comprehensively evaluate LLMs' capability in detecting mixed-context hallucinations, specifically distinguishing between factual and non-factual hallucinations. Through extensive experiments across direct generation and retrieval-based models of varying scales, our main observations are: (1) LLMs' intrinsic knowledge introduces inherent biases in hallucination evaluation; (2) These biases particularly impact the detection of factual hallucinations, yielding a significant performance bottleneck; and (3) the fundamental challenge lies in effective knowledge utilization, balancing between LLMs' intrinsic knowledge and external context for accurate mixed-context hallucination evaluation.},
}
@comment{Markdown (Informal):
[Evaluating LLMs' Assessment of Mixed-Context Hallucination Through the Lens of Summarization](https://aclanthology.org/2025.findings-acl.847/) (Qi et al., Findings of ACL 2025)
}