% Conference paper introducing the Evalica toolkit (COLING 2025, System Demonstrations track).
% The url field uses the canonical ACL Anthology address, not a staging/preview build.
@inproceedings{ustalov-2025-reliable,
  title     = {Reliable, Reproducible, and Really Fast Leaderboards with {Evalica}},
  author    = {Ustalov, Dmitry},
  editor    = {Rambow, Owen and
               Wanner, Leo and
               Apidianaki, Marianna and
               Al-Khalifa, Hend and
               Di Eugenio, Barbara and
               Schockaert, Steven and
               Mather, Brodie and
               Dras, Mark},
  booktitle = {Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.coling-demos.6/},
  pages     = {46--53},
  abstract  = {The rapid advancement of natural language processing (NLP) technologies, such as instruction-tuned large language models (LLMs), urges the development of modern evaluation protocols with human and machine feedback. We introduce Evalica, an open-source toolkit that facilitates the creation of reliable and reproducible model leaderboards. This paper presents its design, evaluates its performance, and demonstrates its usability through its Web interface, command-line interface, and Python API.},
}
Markdown (Informal)
[Reliable, Reproducible, and Really Fast Leaderboards with Evalica](https://aclanthology.org/2025.coling-demos.6/) (Ustalov, COLING 2025)
ACL