@comment{
  Conference-paper record for the olmOCR paper in the ACL 2026
  System Demonstrations volume. Case-sensitive words in title and
  booktitle are protected with whole-word braces.
  NOTE(review): the url field uses the preview.aclanthology.org
  ingest-acl host, presumably a staging link. Confirm the canonical
  URL, and a DOI if one exists, before relying on it.
}
@inproceedings{poznanski-etal-2026-olmocr,
  title     = {The {olmOCR} Project: Building Fully Open {OCR} using {VLMs}},
  author    = {Poznanski, Jake and
               Lo, Kyle and
               Soldaini, Luca},
  editor    = {Durrett, Greg and
               Jian, Ping},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 3: System Demonstrations)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-demo.62/},
  pages     = {626--635},
  isbn      = {979-8-89176-392-0},
  abstract  = {We present olmOCR, a fully open OCR system developed through iterative public releases and community feedback. The system combines a 7B vision-language model trained in two stages: supervised finetuning on 260K diverse PDF pages, followed by reinforcement learning with visual unit tests over synthetic documents. Visual unit tests are binary checks of structural fidelity, including tables and equations, and serve both as an interpretable evaluation framework and as direct optimization targets. We also introduce olmOCR-Bench, a benchmark of 1.4K challenging PDFs evaluated via visual unit tests, on which olmOCR achieves state-of-the-art performance among open systems and proprietary APIs at a fraction of the cost. We have deployed olmOCR at scale to 100M+ PDFs to curate pretraining data for Olmo 3. We share lessons from our open development process and release all models, data, and code across two major releases.},
}
Markdown (Informal)
[The olmOCR Project: Building Fully Open OCR using VLMs](https://preview.aclanthology.org/ingest-acl/2026.acl-demo.62/) (Poznanski et al., ACL 2026)
ACL
- Jake Poznanski, Kyle Lo, and Luca Soldaini. 2026. The olmOCR Project: Building Fully Open OCR using VLMs. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), pages 626–635, San Diego, California, United States. Association for Computational Linguistics.