@inproceedings{yadav-etal-2024-pythonsaga,
title = "{P}ython{S}aga: Redefining the Benchmark to Evaluate Code Generating {LLM}s",
author = "Yadav, Ankit and
Beniwal, Himanshu and
Singh, Mayank",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-emnlp.996/",
doi = "10.18653/v1/2024.findings-emnlp.996",
pages = "17113--17126",
abstract = "Driven by the surge in code generation using large language models (LLMs), numerous benchmarks have emerged to evaluate these LLMs capabilities. We conducted a large-scale human evaluation of *HumanEval* and *MBPP*, two popular benchmarks for Python code generation, analyzing their diversity and difficulty. Our findings unveil a critical bias towards a limited set of programming concepts, neglecting most of the other concepts entirely. Furthermore, we uncover a worrying prevalence of easy tasks that can inflate model performance estimations. To address these limitations, we propose a novel benchmark, *PythonSaga*, featuring 185 hand-crafted prompts in a balanced representation of 38 programming concepts across diverse difficulty levels. The robustness of our benchmark is demonstrated by the poor performance of existing Code-LLMs. The code and data set are openly available to the NLP community at this [URL](https://github.com/PythonSaga/PythonSaga)."
}
Markdown (Informal)
[PythonSaga: Redefining the Benchmark to Evaluate Code Generating LLMs](https://aclanthology.org/2024.findings-emnlp.996/) (Yadav et al., Findings 2024)