@inproceedings{wang-etal-2025-exploring,
    title = "Exploring Limitations of {LLM} Capabilities with Multi-Problem Evaluation",
    author = "Wang, Zhengxiang and
      Kodner, Jordan and
      Rambow, Owen",
    editor = "Drozd, Aleksandr and
      Sedoc, Jo{\~a}o and
      Tafreshi, Shabnam and
      Akula, Arjun and
      Shu, Raphael",
    booktitle = "The Sixth Workshop on Insights from Negative Results in NLP",
    month = may,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.insights-1.12/",
    pages = "121--140",
    isbn = "979-8-89176-240-4",
    abstract = "We propose using prompts made up of multiple problems to evaluate LLM capabilities, an approach we call multi-problem evaluation. We examine 7 LLMs on 4 related task types constructed from 6 existing classification benchmarks. We find that while LLMs can generally perform multiple homogeneous classifications at once (Batch Classification) as well as when they do so separately, they perform significantly worse on two selection tasks that are conceptually equivalent to Batch Classification and involve selecting indices of text falling into each class label, either independently or altogether. We show that such a significant performance drop is due to LLMs' inability to adequately combine index selection with text classification. Such a drop is surprisingly observed across all LLMs attested, under zero-shot, few-shot, and CoT settings, and even with a novel synthetic dataset, potentially reflecting an inherent capability limitation with modern LLMs."
}
Markdown (Informal)
[Exploring Limitations of LLM Capabilities with Multi-Problem Evaluation](https://aclanthology.org/2025.insights-1.12/) (Wang et al., insights 2025)
ACL