@comment{Workshop paper entry (BioNLP 2026) for the SciFact gold-label audit.
  NOTE(review): the url field points at a preview/ingest host; confirm the
  canonical URL before relying on it.}
@inproceedings{sylvestre-2026-gold,
  title     = "Gold Label Errors in the {SciFact} Benchmark: An {LLM}-Assisted Annotation Audit",
  author    = "Sylvestre, Julien",
  editor    = "Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi",
  booktitle = "{BioNLP} 2026",
  month     = jul,
  year      = "2026",
  address   = "San Diego, California",
  publisher = "Association for Computational Linguistics",
  url       = "https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.9/",
  pages     = "97--103",
  isbn      = "979-8-89176-434-7",
  abstract  = "SciFact is a widely-used benchmark for scientific claim verification (645 citations, included in the BEIR evaluation suite). We present, to our knowledge, the first systematic annotation audit of its development and training sets, combining automated screening with a small language model ({\$}0.11 in API fees) and exhaustive manual verification against source publications. We identify 11 gold-label errors in the development set (5.3{\%}, 95{\%} CI 2.7--9.2{\%}, of 209 audited claim--document pairs) and 13 in the training set (2.3{\%}, 95{\%} CI 1.2--3.9{\%}, of 564 audited pairs). The dev errors exhibit a directional asymmetry---9 of 11 mislabel a claim as SUPPORT (one-sided binomial p=0.033, two-sided p=0.065)---and fall into four recurring types. Correcting the dev labels raises binary macro-F1 by 1.7--3.8 points across GPT-5.4 (mini, nano) and Claude Haiku 4.5; gains are larger in 3-way evaluation when mislabeled evidence is recast as NEI (e.g., +9.2 with Haiku 4.5). The binary range is comparable in magnitude to inter-system margins on the SciFact leaderboard. A simple claim-only probe with Haiku 4.5 does not support label memorization as the main explanation for these gains. We release corrected annotations and a blind annotator packet, and recommend that benchmark users prefer the corrected release going forward."
}
Markdown (Informal)
[Gold Label Errors in the SciFact Benchmark: An LLM-Assisted Annotation Audit](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.9/) (Sylvestre, BioNLP 2026)
ACL