@inproceedings{liang-etal-2025-waffle,
    title     = "{WAFFLE}: Fine-tuning Multi-Modal Model for Automated Front-End Development",
    author    = "Liang, Shanchao and
                 Jiang, Nan and
                 Qian, Shangshu and
                 Tan, Lin",
    editor    = "Che, Wanxiang and
                 Nabende, Joyce and
                 Shutova, Ekaterina and
                 Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month     = jul,
    year      = "2025",
    address   = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2025.acl-long.1208/",
    pages     = "24786--24802",
    isbn      = "979-8-89176-251-0",
    abstract  = "Web development involves turning UI designs into functional webpages, which can be difficult for both beginners and experienced developers due to the complexity of HTML{'}s hierarchical structures and styles. While Large Language Models (LLMs) have shown promise in generating source code, two major challenges persist in UI-to-HTML code generation: (1) effectively representing HTML{'}s hierarchical structure for LLMs, and (2) bridging the gap between the visual nature of UI designs and the text-based format of HTML code. To tackle these challenges, we introduce Waffle, a new fine-tuning strategy that uses a structure-aware attention mechanism to improve LLMs{'} understanding of HTML{'}s structure and a contrastive fine-tuning approach to align LLMs{'} understanding of UI images and HTML code. Models fine-tuned with Waffle show up to 9.00 pp (percentage point) higher HTML match, 0.0982 higher CW-SSIM, 32.99 higher CLIP, and 27.12 pp higher LLEM on our new benchmark WebSight-Test and an existing benchmark Design2Code, outperforming current fine-tuning methods.",
}
@comment{
Markdown (Informal)
[WAFFLE: Fine-tuning Multi-Modal Model for Automated Front-End Development](https://aclanthology.org/2025.acl-long.1208/) (Liang et al., ACL 2025)
ACL
}