@inproceedings{basile-etal-2026-many,
title = "How Many Samples Do We Need? A Toolkit for Power-Aware Evaluation Design",
author = "Basile, Angelo and
Sarvazyan, Areg Mikael and
Gonz{\'a}lez, Jos{\'e} {\'A}ngel",
editor = "Piperidis, Stelios and
Bel, N{\'u}ria and
van den Heuvel, Henk and
Ide, Nancy and
Krek, Simon and
Toral, Antonio",
booktitle = "Proceedings of the International Conference on Language Resources and Evaluation",
month = may,
year = "2026",
address = "Palma de Mallorca, Spain",
publisher = "ELRA Language Resources Association",
url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.353/",
pages = "4507--4513",
abstract = "If datasets are the telescopes of our field, then statistical power is their resolution, i.e., their ability to reveal a true difference in model performance when one exists. Many NLP evaluations are underpowered, leading to overstated claims of improvement. This paper introduces sk-power, an open-source Python library that helps researchers and practitioners design well-powered evaluations. Built with familiar scikit-learn-style abstractions, sk-power enables users to simulate evaluation scenarios, estimate minimum detectable effects, and assess the reliability of reported gains. We also illustrate what can go wrong when power analysis isn{'}t carried out. Our goal is to position power analysis as a first-class, practical step in evaluation planning.",
internal-note = "was @article with journal={International Conference on Language Resources and Evaluation} and volume={main}; retyped as @inproceedings (conference paper with editors, venue, and proceedings pages). 'main' is the Anthology volume label, not a volume number. Publisher name corrected to 'Language Resources Association' (ELRA expansion) -- verify against the official proceedings front matter."
}
@comment{Markdown (Informal)}
@comment{
[How Many Samples Do We Need? A Toolkit for Power-Aware Evaluation Design](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.353/) (Basile et al., LREC 2026)
ACL
}