@inproceedings{gill-etal-2025-lost,
title = "What Has Been Lost with Synthetic Evaluation?",
author = "Gill, Alexander and
Ravichander, Abhilasha and
Marasovic, Ana",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.526/",
doi = "10.18653/v1/2025.findings-emnlp.526",
pages = "9902--9945",
ISBN = "979-8-89176-335-7",
abstract = "Large language models (LLMs) are increasingly used for data generation. However, creating evaluation benchmarks raises the bar for this emerging paradigm. Benchmarks must target specific phenomena, penalize exploiting shortcuts, and be challenging. Through two case studies, we ask whether LLMs are ready to meet these demands{---}by generating reasoning-over-text benchmarks and comparing them to those that were created through careful crowdsourcing. Specifically, we evaluate both the *validity* and *difficulty* of LLM-generated versions of two high-quality reading comprehension datasets: CondaQA, which evaluates reasoning about negation, and DROP, which targets reasoning about quantities. We find that prompting LLMs can produce variants of these datasets that are often valid according to the annotation guidelines, at a fraction of the cost of the original crowdsourcing effort. However, we show that they are *less challenging for LLMs* than their human-authored counterparts. This finding sheds light on what may have been lost by generating evaluation data with LLMs, and calls for critically reassessing the immediate use of this increasingly prevalent approach to benchmark creation."
}