@comment{Workshop paper in the GEM 2025 proceedings; ACL Anthology ID 2025.gem-1.22.}
@inproceedings{berrayana-etal-2025-bias,
  title     = {Are Bias Evaluation Methods Biased ?},
  author    = {Berrayana, Lina and
               Rooney, Sean and
               Garc{\'e}s-Erice, Luis and
               Giurgiu, Ioana},
  editor    = {Arviv, Ofir and
               Clinciu, Miruna and
               Dhole, Kaustubh and
               Dror, Rotem and
               Gehrmann, Sebastian and
               Habba, Eliya and
               Itzhak, Itay and
               Mille, Simon and
               Perlitz, Yotam and
               Santus, Enrico and
               Sedoc, Jo{\~a}o and
               Shmueli Scheuer, Michal and
               Stanovsky, Gabriel and
               Tafjord, Oyvind},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.gem-1.22/},
  pages     = {249--261},
  isbn      = {979-8-89176-261-9},
  abstract  = {The creation of benchmarks to evaluate the safety of Large Language Models is one of the key activities within the trusted AI community. These benchmarks allow models to be compared for different aspects of safety such as toxicity, bias, harmful behavior etc. Independent benchmarks adopt different approaches with distinct data sets and evaluation methods. We investigate how robust such benchmarks are by using different approaches to rank a set of representative models for bias and compare how similar are the overall rankings. We show that different but widely used bias evaluation methods result in disparate model rankings. We conclude with recommendations for the community in the usage of such benchmarks.},
}
Markdown (Informal)
[Are Bias Evaluation Methods Biased ?](https://aclanthology.org/2025.gem-1.22/) (Berrayana et al., GEM 2025)
ACL
- Lina Berrayana, Sean Rooney, Luis Garcés-Erice, and Ioana Giurgiu. 2025. Are Bias Evaluation Methods Biased ? In Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²), pages 249–261, Vienna, Austria and virtual meeting. Association for Computational Linguistics.