@comment{
  Conference-paper record for the 64th Annual Meeting of the ACL
  (Volume 1: Long Papers), pages 30169--30185.
  NOTE(review): the url field targets the preview.aclanthology.org ingest-acl
  path; confirm whether a permanent URL or a DOI should replace it.
  NOTE(review): address presumably records the meeting location rather than
  the publisher's city; verify before relying on it as a publisher location.
}
@inproceedings{zhu-etal-2026-beyond,
  author    = {Zhu, Mengxiao and
               Chen, Haixu and
               Sha, Jiu and
               Liu, Jie and
               Shi, Ge},
  title     = {Beyond Atomic Characters: Glyph-Aware Sub-character Alignment for Low-Resource Multilingual {OCR}},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {30169--30185},
  isbn      = {979-8-89176-390-6},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1392/},
  abstract  = {Low-resource multilingual OCR faces a dual challenge: complex script structures and severe data scarcity. In such settings, existing OCR models often struggle, as coarse visual representations combined with weak linguistic priors lead to frequent errors among visually similar characters. To address this, we present BASA (Beyond Atomic Sub-character Alignment), a OCR framework built upon high-resolution visual and language backbones with a novel glyph-aware interface. The core technical contribution is the Glyph-Aware Fine-grained Adapter (GAFA). Unlike standard linear projectors, GAFA employs learnable glyph prototypes to actively align sub-character structural primitives (e.g., strokes and radicals) with visual features, explicitly resolving topological ambiguities during vision{--}language alignment. To complement this, we introduce a two-stage curriculum learning strategy supported by a Glyph-Aware Reverse Synthesis pipeline, which generates large-scale multilingual training corpora with automatic, zero-cost component labels. Furthermore, we construct BASA-Bench, a representative benchmark spanning 11 languages with diverse script structures and 23 authentic scenarios. Experiments demonstrate that BASA achieves consistent improvements over strong OCR baselines, particularly on scripts with complex compositions. Our model and benchmark will be available at \url{https://github.com/NcutLLM/BASA}.},
}
Markdown (Informal)
[Beyond Atomic Characters: Glyph-Aware Sub-character Alignment for Low-Resource Multilingual OCR](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1392/) (Zhu et al., ACL 2026)
ACL