% Short paper in the ACL 2026 proceedings (Volume 2: Short Papers).
% The url field uses the canonical Anthology address for this paper ID instead of
% the ingest-acl preview mirror. The address field is kept exactly as exported.
@inproceedings{zhang-etal-2026-deep,
  title     = {Deep Kernel Fusion for Transformers},
  author    = {Zhang, Zixi and
               Mo, Zhiwen and
               Zhao, Yiren and
               Mullins, Robert D.},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 2: Short Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-short.15/},
  pages     = {166--173},
  isbn      = {979-8-89176-391-3},
  abstract  = {Agentic LLM inference with long contexts is increasingly limited by memory bandwidth rather than compute. In this setting, SwiGLU MLP blocks, whose large weights exceed cache capacity, become a major yet under-optimized bottleneck in the Transformer architecture. We propose DeepFusionKernel, a deeply fused kernel that cuts HBM traffic and boosts cache reuse, delivering up to 13.2{\%} speedup on H100 and 9.7{\%} on A100 over SGLang. Integrated with SGLang and paired with a kernel scheduler, DeepFusionKernel ensures consistent accelerations across generation lengths, while remaining adaptable to diverse models, inference configurations, and hardware platforms.},
}
Markdown (Informal)
[Deep Kernel Fusion for Transformers](https://aclanthology.org/2026.acl-short.15/) (Zhang et al., ACL 2026)
ACL
- Zixi Zhang, Zhiwen Mo, Yiren Zhao, and Robert D. Mullins. 2026. Deep Kernel Fusion for Transformers. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 166–173, San Diego, California, United States. Association for Computational Linguistics.