% Conference paper entry: TexOCR (Wang et al.), ACL 2026, Volume 1: Long Papers.
% Case-sensitive words in title/booktitle are brace-protected as whole words so
% that sentence-casing styles keep them intact without mid-word brace groups.
% NOTE(review): url uses the preview.aclanthology.org ingest host -- confirm
% whether it should be switched to the canonical Anthology URL once available.
@inproceedings{wang-etal-2026-texocr,
  title     = {{TexOCR}: Advancing Document {OCR} Models for Compilable Page-to-{LaTeX} Reconstruction},
  author    = {Wang, Chengye and
               Fu, Lin and
               Kuang, Zexi and
               Zhao, Yilun},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1658/},
  pages     = {35821--35845},
  isbn      = {979-8-89176-390-6},
  abstract  = {Existing document OCR largely targets plain text or Markdown, discarding the structural and executable properties that make LaTeX essential for scientific publishing. We study page-level reconstruction of scientific PDFs into compilable LaTeX and introduce TexOCR-Bench, a benchmark, and TexOCR-Train, a large-scale training corpus, for this task. TexOCR-Bench features a multi-dimensional evaluation suite that jointly assesses transcription fidelity, structural faithfulness, and end-to-end compilability. Leveraging TexOCR-Train, we train a 2B-parameter model, TexOCR, using supervised fine-tuning (SFT) and reinforcement learning (RL) with verifiable rewards derived from LaTeX unit tests that directly enforce compilability and referential integrity. Experiments across 21 frontier models on TexOCR-Bench show that existing systems frequently violate key document invariants, including consistent section structure, correct float placement, and valid label{--}reference links, which undermines compilation reliability and downstream usability. Our analysis further reveals that RL with verifiable rewards yields consistent improvements over SFT alone, particularly on structural and compilation metrics.},
}
Markdown (Informal)
[TexOCR: Advancing Document OCR Models for Compilable Page-to-LaTeX Reconstruction](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1658/) (Wang et al., ACL 2026)
ACL