@inproceedings{poiroux-etal-2025-rlmeval,
  title     = "{RLME}val: Evaluating Research-Level Neural Theorem Proving",
  author    = "Poiroux, Auguste and
               Bosselut, Antoine and
               Kun{\v{c}}ak, Viktor",
  editor    = "Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet",
  booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
  month     = nov,
  year      = "2025",
  address   = "Suzhou, China",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2025.findings-emnlp.581/",
  doi       = "10.18653/v1/2025.findings-emnlp.581",
  pages     = "10946--10957",
  isbn      = "979-8-89176-335-7",
  abstract  = "Despite impressive results on curated benchmarks, the practical impact of large language models (LLMs) on research-level neural theorem proving and proof autoformalization is still limited. We introduce RLMEval, an evaluation suite for these tasks, focusing on research-level mathematics from real-world Lean formalization projects. RLMEval targets the evaluation of neural theorem proving and proof autoformalization on challenging research-level theorems by leveraging real Lean Blueprint formalization projects. Our evaluation of state-of-the-art models on RLMEval, comprising 613 theorems from 6 Lean projects, reveals a significant gap: progress on existing benchmarks does not readily translate to these more realistic settings, with the best model achieving only a 10.3{\%} pass rate. RLMEval provides a new, challenging benchmark designed to guide and accelerate progress in automated reasoning for formal mathematics."
}
Markdown (Informal)
[RLMEval: Evaluating Research-Level Neural Theorem Proving](https://aclanthology.org/2025.findings-emnlp.581/) (Poiroux et al., Findings 2025)
ACL