@inproceedings{zhang-zhou-2025-continuous,
title = "Continuous-Time Attention: {PDE}-Guided Mechanisms for Long-Sequence Transformers",
author = "Zhang, Yukun and
Zhou, Xueqing",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.emnlp-main.1097/",
doi = "10.18653/v1/2025.emnlp-main.1097",
pages = "21654--21674",
ISBN = "979-8-89176-332-6",
abstract = "We present Continuous-Time Attention, a novel framework that infuses partial differential equations (PDEs) into the Transformer{'}s attention mechanism to better handle long sequences. Instead of relying on a static attention matrix, we allow attention weights to evolve along a pseudo-time dimension governed by diffusion, wave, or reaction-diffusion dynamics. This dynamic process systematically smooths local noise, strengthens long-range dependencies, and improves gradient stability during training.Our theoretical analysis shows that PDE-driven attention mitigates the exponential decay of distant interactions and improves the optimization landscape. Empirically, Continuous-Time Attention achieves consistent performance gains over both standard and long-sequence Transformer variants across a range of tasks. These results suggest that embedding continuous-time dynamics into attention mechanisms is a promising direction for enhancing global coherence and scalability in Transformer models. Code is publicly available at:https://github.com/XueqingZhou/Continuous-Time-Attention"
}
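Below is a minimal, illustrative sketch of the diffusion variant described in the abstract: raw attention scores evolve for a few explicit-Euler pseudo-time steps under a 1-D heat equation along the key axis before the softmax. This is not the authors' implementation (see the linked GitHub repository for the official code); the function name and hyperparameters (`diffuse_attention`, `num_steps`, `dt`, `diffusivity`) are assumptions chosen for illustration, and PyTorch is used for convenience.

```python
import torch
import torch.nn.functional as F


def diffuse_attention(scores: torch.Tensor, num_steps: int = 4,
                      dt: float = 0.1, diffusivity: float = 1.0) -> torch.Tensor:
    """Evolve raw attention scores with explicit Euler steps of a 1-D heat
    equation along the key axis, then renormalize with a softmax.

    Note: this is a hedged sketch of the general idea, not the paper's code.
    The explicit scheme stays stable when dt * diffusivity <= 0.5.
    """
    a = scores
    for _ in range(num_steps):
        lap = torch.zeros_like(a)
        # Interior points: second difference along the key (last) axis.
        lap[..., 1:-1] = a[..., :-2] + a[..., 2:] - 2.0 * a[..., 1:-1]
        # Zero-flux (Neumann) boundaries at the first and last key position.
        lap[..., 0] = a[..., 1] - a[..., 0]
        lap[..., -1] = a[..., -2] - a[..., -1]
        a = a + dt * diffusivity * lap  # explicit Euler pseudo-time step
    return F.softmax(a, dim=-1)


# Toy usage with a (batch, heads, queries, keys) score tensor.
scores = torch.randn(2, 4, 16, 16)
attn = diffuse_attention(scores)              # smoothed, renormalized weights
output = attn @ torch.randn(2, 4, 16, 32)     # standard attention readout
```

The diffusion step smooths the score rows before normalization; the paper also describes wave and reaction-diffusion dynamics, which would replace the Laplacian update above with the corresponding discretized operator.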