% Lee, Liaw and Yogatama (2026), "FOL-Traces", Findings of the ACL: EACL 2026, pp. 2181--2203.
% Case-sensitive words are brace-protected as whole words so sentence-casing styles keep them.
% The url is the canonical Anthology address rather than a temporary ingestion preview link.
% NOTE(review): address presumably records the conference venue (Anthology export convention), not the publisher's city -- confirm.
% NOTE(review): the third editor's surname may need an accent (M{\`a}rquez) -- confirm against the proceedings front matter.
@inproceedings{lee-etal-2026-fol,
  title     = {{FOL-Traces}: Verified First-Order Logic Reasoning Traces at Scale},
  author    = {Lee, Isabelle and
               Liaw, Sarah and
               Yogatama, Dani},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-eacl.115/},
  pages     = {2181--2203},
  isbn      = {979-8-89176-386-9},
  abstract  = {Reasoning in language models is difficult to evaluate: natural-language traces are unverifiable, symbolic datasets are too small, and most benchmarks conflate heuristics with inference. We present FOL-Traces, the first large-scale dataset of programmatically verified reasoning traces, enabling rigorous evaluation of structured logical inference. We also propose two challenging and comprehensive diagnostic tasks{---}masked operation prediction and step completion{---}that directly probe syntactic awareness and process fidelity. FOL-Traces serves as a scalable testbed for rigorously studying how models perform structured logical inference. Systematic experiments with 5 reasoning LLMs show that the dataset remains challenging: models only reach around 45.7{\%} accuracy on masked operation prediction and around 27{\%} on two-step completion.},
}
Markdown (Informal)
[FOL-Traces: Verified First-Order Logic Reasoning Traces at Scale](https://aclanthology.org/2026.findings-eacl.115/) (Lee et al., Findings 2026)
ACL