% ACL 2026 long paper (Volume 1: Long Papers) introducing the Vector Calligrapher system.
% Braces in the title protect the system name and the format name from style-applied sentence casing.
% NOTE(review): the url field points at a preview/ingest host; confirm whether it
% should be replaced by the canonical Anthology URL once the volume is published.
@inproceedings{zhou-etal-2026-vector,
  title     = {{Vector Calligrapher}: Generating {Scalable Vector Graphics} via Structured Linguistic Supervision},
  author    = {Zhou, Bo and
               Chen, Xikang and
               Gong, Yan and
               Zhang, Yin},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.511/},
  pages     = {11152--11168},
  isbn      = {979-8-89176-390-6},
  abstract  = {Generating SVG-based fonts requires Multimodal Large Language Models (MLLMs) to translate high-level linguistic intent into low-level, topologically constrained symbolic sequences. However, current approaches struggle with two fundamental misalignments: the semantic ambiguity of unstructured natural language for precise geometric control, and the inefficiency of generic text tokenizers, which fragment coordinate-dense SVG XML into excessively long sequences with low information density. In this work, we propose Vector Calligrapher, a system that treats SVG generation as a conditional language modeling task optimized for both semantic grounding and representational efficiency. To bridge the semantic gap, we introduce a structured linguistic supervision Font Description Framework that decomposes typographic style into interpretable linguistic dimensions (e.g., historical lineage, affective metaphors), providing structured supervision aligned with the compositional syntax of SVG. To address the tokenization bottleneck, we design a scalable separated-coordinate strategy that bypasses the vocabulary explosion of flattened tokens while significantly compressing sequence length. Supported by VectorFont, a dataset of over 10 million hierarchically annotated glyphs, our approach improves CLIP score by +23{\%}, reduces geometric error by {\ensuremath{\approx}}48{\%}, and boosts generation efficiency by achieving an 18{\%} Command-per-Token (C/T) ratio{---}a 6{\texttimes} increase in information density over standard baselines. These results demonstrate that combining structured linguistic supervision with efficient symbolic tokenization is essential for reliable, controllable vector graphics synthesis. VectorFont dataset, Code and model weights will be publicly released.},
}
Markdown (Informal)
[Vector Calligrapher: Generating Scalable Vector Graphics via Structured Linguistic Supervision](https://preview.aclanthology.org/ingest-acl/2026.acl-long.511/) (Zhou et al., ACL 2026)
ACL