@inproceedings{yu-etal-2025-humaneval,
title = "{H}uman{E}val Pro and {MBPP} Pro: Evaluating Large Language Models on Self-invoking Code Generation Task",
author = "Yu, Zhaojian and
Zhao, Yilun and
Cohan, Arman and
Zhang, Xiao-Ping",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.686/",
pages = "13253--13279",
ISBN = "979-8-89176-256-5",
abstract = "In this paper, we present HumanEval Pro and MBPP Pro, a series of benchmarks to evaluate LLMs on self-invoking code generation task. This task involves providing LLMs with a base problem alongside a related, more complex problem. The models must solve the base problem and leverage its solution to address the more complex one, thereby showcasing their capacity for progressive reasoning and problem-solving. This work features three key contributions. First, we propose a general recipe for generating more challenging versions of existing benchmarks. Second, from the analysis of experimental results over twenty large language models (LLM) on our benchmarks, we have two important observations: (i) Most LLMs excel in traditional code generation benchmarks like HumanEval and MBPP, but their performance declines on self-invoking tasks. For example, o1-mini achieves 96.2{\%} pass@1 on HumanEval but only 76.2{\%} on HumanEval Pro. (ii) On self-invoking code generation task, the instruction-tuned models demonstrate only marginal improvements compared to the base models. Third, we disclose the types of failure modes that exist in our evaluation results. All these results underscore the need for further advancements in this area and provide a new prospective to future research."
}
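The self-invoking setting described in the abstract pairs a base problem with a harder follow-up whose solution is expected to call the base solution. A minimal illustrative sketch in Python (a hypothetical pair invented here, not taken from HumanEval Pro or MBPP Pro) might look like:

# Hypothetical self-invoking pair, for illustration only (not from the benchmark).

def sum_of_digits(n: int) -> int:
    # Base problem: return the sum of the decimal digits of a non-negative integer.
    return sum(int(d) for d in str(n))

def sort_by_digit_sum(nums: list[int]) -> list[int]:
    # Self-invoking problem: sort integers by digit sum, reusing the base solution.
    return sorted(nums, key=sum_of_digits)

assert sum_of_digits(91) == 10
assert sort_by_digit_sum([91, 20, 5]) == [20, 5, 91]

In the benchmark's framing, a model that solves the base problem but cannot compose it into the follow-up would pass HumanEval/MBPP-style evaluation while failing the Pro variant.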