@comment{
  EdgeFormer (Yao, Niu, Dai, Ren): paper in the Proceedings of the 64th Annual
  Meeting of the ACL, Volume 1: Long Papers (2026).
  Math macros in the abstract are written with a single backslash (\times, \leq)
  so they typeset as symbols; a doubled backslash is a LaTeX line break.
  NOTE(review): the url field points at the Anthology preview/ingest host --
  presumably it should be swapped for the permanent URL once one exists; confirm.
}
@inproceedings{yao-etal-2026-edgeformer,
    title = "{E}dge{F}ormer: Latency-Aware Collaborative Multi-Head Attention of Transformer Inference in Edge Networks",
    author = "Yao, Yiming and
      Niu, Jianwei and
      Dai, Bin and
      Ren, Tao",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.acl-long.2007/",
    pages = "43346--43361",
    isbn = "979-8-89176-390-6",
    abstract = "Recent breakthroughs in Transformer-based large models, have driven widespread tasks, yet their reliance on centralized cloud deployment raises significant privacy risks due to sensitive data exposure. While edge-based collaborative inference offers a privacy-preserving alternative, existing methods face critical limitations: static model partitioning cannot adapt to dynamic edge resource fluctuations, and rigid multi-head attention handling overlooks semantic-critical prioritization and parallelism. We propose EdgeFormer, a latency-aware framework for distributed Transformer inference in resource-constrained edge networks. EdgeFormer dynamically allocates model blocks across devices via efficiency-storage trade-off optimization and introduces collaborative Multi-Head Attention (cMHA), which distributes semantic-critical attention heads across devices while pruning redundant ones under real-time constraints. We further develop LiScore, a composite metric integrating attention diversity and latency costs, alongside a similarity-based retrieval method to reduce recomputation overhead. Extensive experiments demonstrate that EdgeFormer achieves up to 2.01 $\times$ inference acceleration over state-of-the-art baselines with $\leq$1.06{\%} accuracy loss, maintaining robustness under varying edge conditions."
}
Markdown (Informal)
[EdgeFormer: Latency-Aware Collaborative Multi-Head Attention of Transformer Inference in Edge Networks](https://preview.aclanthology.org/ingest-acl/2026.acl-long.2007/) (Yao et al., ACL 2026)
ACL