% Conference paper (ACL 2026): multistage OCR + page-level retrieval + VLM extraction pipeline for long scanned KYC documents.
@inproceedings{yuxuan-etal-2026-multistage,
  title     = {A Multistage Extraction Pipeline for Long Scanned Financial Documents: An Empirical Study in Industrial {KYC} Workflows},
  author    = {Yuxuan, Han and
               Zhang, Yuanxing and
               Wang, Yushuo and
               Jin, Yichao},
  editor    = {Li, Yunyao and
               Rehm, Georg and
               Tu, Mei},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-industry.99/},
  pages     = {1419--1433},
  isbn      = {979-8-89176-394-4},
  abstract  = {Structured information extraction from long, multilingual scanned financial documents is a core requirement in industrial KYC and compliance workflows. These documents are typically non-machine-readable, noisy, and visually heterogeneous. They usually span dozens of pages while containing only sparse task-relevant information. Although recent vision{--}language models (VLMs) achieve strong benchmark performance, directly applying them end-to-end to full financial reports often leads to unreliable extraction under real-world conditions. We present a multistage extraction framework that integrates image preprocessing, multilingual OCR, hybrid page-level retrieval, and compact VLM-based structured extraction. The design separates page localization from multimodal reasoning, enabling more accurate extraction from complex multi-page documents. We evaluated the framework on 120 production KYC documents comprising about 3000 multilingual scanned pages. Across multiple OCR{--}VLM combinations, the proposed pipeline consistently outperforms direct PDF-to-VLM baselines, improving field-level accuracy by up to 31.9 percentage points. The best configuration, PaddleOCR with MiniCPM-o-2.6, achieves 87.27{\%} accuracy. Ablation studies show that page-level retrieval is the dominant factor in performance improvements, particularly for complex financial statements and non-English documents.},
}
Markdown (Informal)
[A Multistage Extraction Pipeline for Long Scanned Financial Documents: An Empirical Study in Industrial KYC Workflows](https://preview.aclanthology.org/ingest-acl/2026.acl-industry.99/) (Yuxuan et al., ACL 2026)
ACL