@inproceedings{leang-etal-2025-theorem,
title = "Theorem Prover as a Judge for Synthetic Data Generation",
author = "Leang, Joshua Ong Jun and
Hong, Giwon and
Li, Wenda and
Cohen, Shay B",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1448/",
pages = "29941--29977",
ISBN = "979-8-89176-251-0",
abstract = "The demand for synthetic data in mathematical reasoning has increased due to its potential to enhance the mathematical capabilities of large language models (LLMs). However, ensuring the validity of intermediate reasoning steps remains a significant challenge, affecting data quality. While formal verification via theorem provers effectively validates LLM reasoning, the autoformalisation of mathematical proofs remains error-prone. In response, we introduce *iterative autoformalisation*, an approach that iteratively refines theorem prover formalisation to mitigate errors, thereby increasing the execution rate on the Lean prover from 60{\%} to 87{\%}. Building upon that, we introduce *Theorem Prover as a Judge (TP-as-a-Judge)*, a method that employs theorem prover formalisation to rigorously assess LLM intermediate reasoning, effectively integrating autoformalisation with synthetic data generation. Finally, we present *Reinforcement Learning from Theorem Prover Feedback (RLTPF),* a framework that replaces human annotation with theorem prover feedback in Reinforcement Learning from Human Feedback (RLHF). Across multiple LLMs, applying *TP-as-a-Judge* and *RLTPF* improves benchmarks with only 3,508 samples, achieving 5.56{\%} accuracy gain on Mistral-7B for MultiArith, 6.00{\%} on Llama-2-7B for SVAMP, and 3.55{\%} on Llama-3.1-8B for AQUA."
}
Markdown (Informal)
[Theorem Prover as a Judge for Synthetic Data Generation](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1448/) (Leang et al., ACL 2025)
ACL
- Joshua Ong Jun Leang, Giwon Hong, Wenda Li, and Shay B Cohen. 2025. Theorem Prover as a Judge for Synthetic Data Generation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 29941–29977, Vienna, Austria. Association for Computational Linguistics.