@inproceedings{liu-etal-2025-document,
title = "Document-level Simplification and Illustration Generation Multimodal Coherence",
author = "Liu, Yuhang and
Zhang, Mo and
Cheng, Zhaoyi and
Ebling, Sarah",
editor = "Shardlow, Matthew and
Alva-Manchego, Fernando and
North, Kai and
Stodden, Regina and
Saggion, Horacio and
Khallaf, Nouran and
Hayakawa, Akio",
booktitle = "Proceedings of the Fourth Workshop on Text Simplification, Accessibility and Readability (TSAR 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.tsar-1.2/",
pages = "19--35",
ISBN = "979-8-89176-176-6",
abstract = "We present a novel method for document-level text simplification and automatic illustration generation aimed at enhancing information accessibility for individuals with cognitive impairments. While prior research has primarily focused on sentence- or paragraph-level simplification and text-to-image generation for narrative contexts this work addresses the unique challenges of simplifying long-form documents and generating semantically aligned visuals. The pipeline consists of three stages (1) discourse-aware segmentation using large language models (2) visually grounded description generation via abstraction and (3) controlled image synthesis using state-of-the-art diffusion models including DALLE 3 and FLUX1-dev. We further incorporate stylistic constraints to ensure visual coherence and we conduct a human evaluation measuring comprehension semantic alignment and visual clarity. Experimental results demonstrate that our method effectively combines simplified text and visual content with generated illustrations enhancing textual accessibility."
}