% PIXAR conference paper, published in the Findings of ACL 2024 volume.
% The url field holds the canonical ACL Anthology permalink; the doi field is
% the stable identifier and should be preferred by styles that support it.
@inproceedings{tai-etal-2024-pixar,
  title     = {{PIXAR}: Auto-Regressive Language Modeling in Pixel Space},
  author    = {Tai, Yintao and
               Liao, Xiyang and
               Suglia, Alessandro and
               Vergari, Antonio},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.874/},
  doi       = {10.18653/v1/2024.findings-acl.874},
  pages     = {14673--14695},
  abstract  = {Recent work showed the possibility of building open-vocabulary large language models (LLMs) that directly operate on pixel representations. These models are implemented as autoencoders that reconstruct masked patches of rendered text. However, these pixel-based LLMs are limited to discriminative tasks (e.g., classification) and, similar to BERT, cannot be used to generate text. Therefore, they cannot be used for generative tasks such as free-form question answering. In this work, we introduce PIXAR, the first pixel-based autoregressive LLM that performs text generation. Consisting of only a decoder, PIXAR can perform free-form generative tasks while keeping the number of parameters on par with previous encoder-decoder models. Furthermore, we highlight the challenges of generating text as non-noisy images and show this is due to using a maximum likelihood objective. To overcome this problem, we propose an adversarial pretraining stage that improves the readability and accuracy of PIXAR by 8.1 on LAMBADA and 8.5 on bAbI{---}making it comparable to GPT-2 on text generation tasks. This paves the way to build open-vocabulary LLMs that operate on perceptual input only and calls into question the necessity of the usual symbolic input representation, i.e., text as (sub)tokens.},
}
Markdown (Informal)
[PIXAR: Auto-Regressive Language Modeling in Pixel Space](https://aclanthology.org/2024.findings-acl.874/) (Tai et al., Findings 2024)
ACL
- Yintao Tai, Xiyang Liao, Alessandro Suglia, and Antonio Vergari. 2024. PIXAR: Auto-Regressive Language Modeling in Pixel Space. In Findings of the Association for Computational Linguistics: ACL 2024, pages 14673–14695, Bangkok, Thailand. Association for Computational Linguistics.