@inproceedings{wang-etal-2025-evaluating,
title = "Evaluating {LLM}s with Multiple Problems at once",
author = "Wang, Zhengxiang and
Kodner, Jordan and
Rambow, Owen",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli-Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/metadata-correction-jian-chen-ub/2025.gem-1.14/",
pages = "178--199",
ISBN = "979-8-89176-261-9",
abstract = "This paper shows the benefits and fruitfulness of evaluating LLMs with multiple problems at once, a paradigm we call multi-problem evaluation (MPE). Unlike conventional single-problem evaluation, where a prompt presents a single problem and expects one specific answer, MPE places multiple problems together in a single prompt and assesses how well an LLM answers all these problems in a single output. Leveraging 6 classification and 12 reasoning benchmarks that already exist, we introduce a new benchmark called ZeMPE (Zero-shot Multi-Problem Evaluation), comprising 53,100 zero-shot multi-problem prompts. We experiment with a total of 13 LLMs from 5 model families on ZeMPE to present a comprehensive and systematic MPE. Our results show that LLMs are capable of handling multiple problems from a single data source as well as handling them separately, but there are conditions this multiple problem handling capability falls short. In addition, we perform in-depth further analyses and explore model-level factors that may enable multiple problem handling capabilities in LLMs. We release our corpus and code to facilitate future research."
}