@article{yang-etal-2026-simulating,
title = "Simulating Hard Attention Using Soft Attention",
author = "Yang, Andy and
Strobl, Lena and
Chiang, David and
Angluin, Dana",
journal = "Transactions of the Association for Computational Linguistics",
volume = "14",
year = "2026",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://preview.aclanthology.org/fix-opsupmap-display/2026.tacl-1.8/",
doi = "10.1162/tacl.a.597",
pages = "147--166",
abstract = "We study conditions under which transformers using soft attention can simulate hard attention, that is, effectively focus all attention on a subset of positions. First, we examine several subclasses of languages recognized by hard-attention transformers, which can be defined in variants of linear temporal logic. We demonstrate how soft-attention transformers can compute formulas of these logics using unbounded positional embeddings or temperature scaling. Second, we demonstrate how temperature scaling allows softmax transformers to simulate general hard-attention transformers, using a temperature that depends on the minimum gap between the maximum attention scores and other attention scores."
}
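As a rough illustration of the temperature-scaling idea summarized in the abstract (a minimal sketch, not the paper's construction; the scores and gap below are invented for the example): dividing attention scores by a temperature that shrinks relative to the gap between the maximum score and the rest makes softmax concentrate essentially all weight on the maximal positions, approximating hard attention.

```python
# Hypothetical illustration of temperature scaling: as the temperature shrinks
# (relative to the gap between the maximum score and all other scores),
# softmax weights approach hard attention over the argmax positions.
import numpy as np

def soft_attention(scores, temperature=1.0):
    """Softmax attention weights at a given temperature."""
    z = scores / temperature
    z = z - z.max()          # subtract max for numerical stability
    w = np.exp(z)
    return w / w.sum()

# Made-up scores with two tied maxima (3.5) and a gap of 1.5 to the runner-up.
scores = np.array([2.0, 3.5, 1.0, 3.5])

for t in [1.0, 0.1, 0.01]:
    print(t, soft_attention(scores, t).round(4))
# As t -> 0, the weights approach [0, 0.5, 0, 0.5]: all mass is shared
# equally among the maximal positions, as in average-hard attention.
```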