@inproceedings{li-etal-2025-text,
title = "Text or Pixels? Evaluating Efficiency and Understanding of {LLM}s with Visual Text Inputs",
author = "Li, Yanhong and
Lan, Zixuan and
Zhou, Jiawei",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.findings-emnlp.558/",
doi = "10.18653/v1/2025.findings-emnlp.558",
pages = "10564--10578",
ISBN = "979-8-89176-335-7",
abstract = "Large language models (LLMs) and their multimodal variants can now process visual inputs, including images of text. This raises an intriguing question: Can we compress textual inputs by feeding them as images to reduce token usage while preserving performance?In this paper, we show that *visual text representations* are a practical and surprisingly effective form of input compression for decoder LLMs. We exploit this idea by rendering long text inputs as a single image and providing it directly to the model. This approach dramatically reduces the number of decoder tokens required, offering a new form of input compression. Through experiments on two distinct benchmarks {---} RULER (long-context retrieval) and CNN/DailyMail (document summarization) {---} we demonstrate that this text-as-image method yields substantial token savings *without degrading task performance*."
}