% Wang, Kodner, and Rambow (2025): workshop paper introducing multi-problem
% evaluation (MPE) and the ZeMPE benchmark, published in the GEM workshop proceedings.
% The url field holds the canonical ACL Anthology address for paper 2025.gem-1.14.
% The address field carries the meeting location, as exported by the Anthology.
@inproceedings{wang-etal-2025-evaluating,
  title     = {Evaluating {LLMs} with Multiple Problems at once},
  author    = {Wang, Zhengxiang and
               Kodner, Jordan and
               Rambow, Owen},
  editor    = {Dhole, Kaustubh and
               Clinciu, Miruna},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.gem-1.14/},
  pages     = {178--199},
  isbn      = {979-8-89176-261-9},
  abstract  = {This paper shows the benefits and fruitfulness of evaluating LLMs with multiple problems at once, a paradigm we call multi-problem evaluation (MPE). Unlike conventional single-problem evaluation, where a prompt presents a single problem and expects one specific answer, MPE places multiple problems together in a single prompt and assesses how well an LLM answers all these problems in a single output. Leveraging 6 classification and 12 reasoning benchmarks that already exist, we introduce a new benchmark called ZeMPE (Zero-shot Multi-Problem Evaluation), comprising 53,100 zero-shot multi-problem prompts. We experiment with a total of 13 LLMs from 5 model families on ZeMPE to present a comprehensive and systematic MPE. Our results show that LLMs are capable of handling multiple problems from a single data source as well as handling them separately, but there are conditions this multiple problem handling capability falls short. In addition, we perform in-depth further analyses and explore model-level factors that may enable multiple problem handling capabilities in LLMs. We release our corpus and code to facilitate future research.},
}
Markdown (Informal)
[Evaluating LLMs with Multiple Problems at once](https://preview.aclanthology.org/corrections-2025-08/2025.gem-1.14/) (Wang et al., GEM 2025)
ACL
- Zhengxiang Wang, Jordan Kodner, and Owen Rambow. 2025. Evaluating LLMs with Multiple Problems at once. In Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²), pages 178–199, Vienna, Austria and virtual meeting. Association for Computational Linguistics.