% Workshop paper in the SCALE-LLM 2024 proceedings (ACL Anthology ID 2024.scalellm-1.4).
% The url field uses the canonical Anthology address instead of a preview/staging mirror.
% NOTE(review): address appears to hold the workshop location rather than the
% publisher's city (ACL Anthology export convention) -- confirm before normalising.
@inproceedings{chia-etal-2024-instructeval,
  title     = {{InstructEval}: Towards Holistic Evaluation of Instruction-Tuned Large Language Models},
  author    = {Chia, Yew Ken and
               Hong, Pengfei and
               Bing, Lidong and
               Poria, Soujanya},
  editor    = {Miceli-Barone, Antonio Valerio and
               Barez, Fazl and
               Cohen, Shay and
               Voita, Elena and
               Germann, Ulrich and
               Lukasik, Michal},
  booktitle = {Proceedings of the First edition of the Workshop on the Scaling Behavior of Large Language Models ({SCALE-LLM} 2024)},
  month     = mar,
  year      = {2024},
  address   = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.scalellm-1.4/},
  pages     = {35--64},
  abstract  = {Instruction-tuned large language models have revolutionized natural language processing and have shown great potential in applications such as conversational agents. These models, such as GPT-4, can not only master language but also solve complex tasks in areas like mathematics, coding, medicine, and law. However, there is still a lack of comprehensive understanding regarding their full potential, primarily due to the black-box nature of many models and lack of holistic evaluation. To address these challenges, we present InstructEval, a more comprehensive evaluation suite designed specifically for instruction-tuned large language models. Unlike previous works, our evaluation involves a rigorous assessment of models based on problem-solving, writing ability, and alignment to human values. We take a holistic approach to analyze various factors affecting model performance, including the pretraining foundation, instruction-tuning data, and training methods. Our findings reveal that the quality of instruction data is a crucial factor in scaling model performance. While open-source models demonstrate impressive writing abilities, there is substantial room for improvement in problem-solving and alignment.},
}
Markdown (Informal)
[InstructEval: Towards Holistic Evaluation of Instruction-Tuned Large Language Models](https://aclanthology.org/2024.scalellm-1.4/) (Chia et al., SCALE-LLM 2024)
ACL