@comment{
  Workshop paper from the GEM 2025 proceedings (ACL Anthology export).
  The address field appears to hold the event venue rather than the
  publisher's city; it is kept as exported.
  The abstract is stored as running text, so only genuine compound
  hyphens (e.g. user-centered, data-driven) belong in it.
}
@inproceedings{yu-2025-towards,
  title     = {Towards Comprehensive Evaluation of Open-Source Language Models: A Multi-Dimensional, User-Driven Approach},
  author    = {Yu, Qingchen},
  editor    = {Arviv, Ofir and
               Clinciu, Miruna and
               Dhole, Kaustubh and
               Dror, Rotem and
               Gehrmann, Sebastian and
               Habba, Eliya and
               Itzhak, Itay and
               Mille, Simon and
               Perlitz, Yotam and
               Santus, Enrico and
               Sedoc, Jo{\~a}o and
               Shmueli Scheuer, Michal and
               Stanovsky, Gabriel and
               Tafjord, Oyvind},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/nschneid-patch-1/2025.gem-1.1/},
  pages     = {1--7},
  isbn      = {979-8-89176-261-9},
  abstract  = {With rapid advancements in large language models (LLMs) across artificial intelligence, machine learning, and data science, there is a growing need for evaluation frameworks that go beyond traditional performance metrics. Conventional methods focus mainly on accuracy and computational metrics, often neglecting user experience and community interaction{---}key elements in open-source environments. This paper introduces a multi-dimensional, user-centered evaluation framework, integrating metrics like User Engagement Index (UEI), Community Response Rate (CRR), and a Time Weight Factor (TWF) to assess LLMs' real-world impact. Additionally, we propose an adaptive weighting mechanism using Bayesian optimization to dynamically adjust metric weights for more accurate model evaluation. Experimental results confirm that our framework effectively identifies models with strong user engagement and community support, offering a balanced, data-driven approach to open-source LLM evaluation. This framework serves as a valuable tool for developers and researchers in selecting and improving open-source models.},
}
Markdown (Informal)
[Towards Comprehensive Evaluation of Open-Source Language Models: A Multi-Dimensional, User-Driven Approach](https://preview.aclanthology.org/nschneid-patch-1/2025.gem-1.1/) (Yu, GEM 2025)
ACL