% Conference paper in the Findings of ACL 2026 proceedings (pp. 38491--38529).
% NOTE(review): the url field points at a preview/ingest host -- confirm the
% canonical address before this entry is used in a camera-ready bibliography.
@inproceedings{mule-etal-2026-teaching,
  author    = {Mule, Srujan P and Garikaparthi, Aniketh and Patwardhan, Manasi},
  title     = {Teaching Language Models to Forecast Research Success Through Comparative Idea Evaluation},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  year      = {2026},
  month     = jul,
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {38491--38529},
  isbn      = {979-8-89176-395-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.1918/},
  abstract  = {As language models accelerate scientific research by automating hypothesis generation and implementation, a new bottleneck emerges: evaluating and filtering hundreds of AI-generated ideas without exhaustive experimentation. We ask whether LMs can learn to forecast the empirical success of research ideas before any experiments are run. We study \textit{comparative empirical forecasting}: given a benchmark-specific research goal and two candidate ideas, predict which will achieve better benchmark performance. We construct a dataset of 11,488 idea pairs grounded in objective outcomes from PapersWithCode. While off-the-shelf 8B-parameter models struggle (30{\%} acc.), SFT dramatically boosts performance to 77.1{\%}, outperforming GPT-5 (61.1{\%}). By framing evaluation as a reasoning task via Reinforcement Learning with Verifiable Rewards (RLVR), we train models to discover latent reasoning paths, achieving 71.35{\%} acc. with interpretable justifications. Through additional ablations and out-of-distribution tests, we show robustness to surface-level heuristics and transfer to both a cross-domain time-split test set and an independently constructed test set. Our results demonstrate that compute-efficient small language models can serve as effective, objective verifiers, offering a scalable path for autonomous scientific discovery.},
}
Markdown (Informal)
[Teaching Language Models to Forecast Research Success Through Comparative Idea Evaluation](https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.1918/) (Mule et al., Findings 2026)
ACL