@inproceedings{fatahi-bayat-etal-2025-factbench,
title = "{F}act{B}ench: A Dynamic Benchmark for In-the-Wild Language Model Factuality Evaluation",
author = "Fatahi Bayat, Farima and
Zhang, Lechen and
Munir, Sheza and
Wang, Lu",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1587/",
pages = "33090--33110",
ISBN = "979-8-89176-251-0",
abstract = "The rapid adoption of language models (LMs) across diverse applications has raised concerns about their factuality, i.e., their consistency with real-world facts. We introduce VERIFY, an evidence-based evaluation pipeline that measures LMs' factuality in real-world user interactions. VERIFY considers the verifiability of LM-generated content and categorizes content units as Supported, Unsupported, or Undecidable based on Web-retrieved evidence. Importantly, factuality judgment by VERIFY more strongly correlates with human evaluations than existing methods. Using VERIFY, we identify ``hallucination prompts,'' i.e., those that frequently elicit factual errors in LM responses. These prompts form FactBench, a dataset of 1K prompts spanning 150 topics and tiered into Easy, Moderate, and Hard prompts. We benchmark widely-used openweight and proprietary LMs from six families, yielding three key findings: (i) LMs' factual precision declines from Easy to Hard prompts, (ii) factuality does not necessarily improve with scale; Llama3.1-405B-Instruct performs comparably to or worse than its 70B variant, and (iii) Gemini1.5-Pro shows a notably higher refusal rate, with over-refusal in 25{\%} of cases."
}