@inproceedings{pietruszka-etal-2022-sparsifying,
title = "Sparsifying Transformer Models with Trainable Representation Pooling",
author = "Pietruszka, Micha{\l} and
Borchmann, {\L}ukasz and
Garncarek, {\L}ukasz",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2022.acl-long.590/",
doi = "10.18653/v1/2022.acl-long.590",
pages = "8616--8633",
abstract = "We propose a novel method to sparsify attention in the Transformer model by learning to select the most-informative token representations during the training process, thus focusing on the task-specific parts of an input. A reduction of quadratic time and memory complexity to sublinear was achieved due to a robust trainable top-$k$ operator.Our experiments on a challenging long document summarization task show that even our simple baseline performs comparably to the current SOTA, and with trainable pooling we can retain its top quality, while being $1.8\times$ faster during training, $4.5\times$ faster during inference, and up to $13\times$ more computationally efficient in the decoder."
}
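
The abstract describes scoring token representations and keeping only the top-$k$ of them so that attention operates on a shorter sequence. The sketch below is only an illustration of that general idea in PyTorch, not the paper's operator: the class name `TopKPooling`, the linear scorer, and the score-weighting trick used to pass gradients through the hard selection are assumptions of this sketch (the paper uses its own differentiable top-$k$ formulation).

```python
# Minimal sketch (assumed, not the authors' implementation): score each token
# with a learned linear layer, keep the k highest-scoring representations, and
# weight them by their sigmoid scores so the scorer still receives gradients.
import torch
import torch.nn as nn


class TopKPooling(nn.Module):
    """Select the k highest-scoring token representations from a sequence."""

    def __init__(self, hidden_dim: int, k: int):
        super().__init__()
        self.scorer = nn.Linear(hidden_dim, 1)  # learned per-token relevance score
        self.k = k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, hidden_dim)
        scores = self.scorer(x).squeeze(-1)               # (batch, seq_len)
        topk_scores, topk_idx = scores.topk(self.k, dim=-1)
        # Gather the selected representations.
        idx = topk_idx.unsqueeze(-1).expand(-1, -1, x.size(-1))
        pooled = x.gather(1, idx)                         # (batch, k, hidden_dim)
        # Hard top-k indices are not differentiable, so multiply by the soft
        # scores to let gradients flow back into the scorer.
        return pooled * torch.sigmoid(topk_scores).unsqueeze(-1)


# Usage: shrink a 4096-token sequence to 256 pooled representations,
# so downstream attention cost scales with k rather than the full length.
pool = TopKPooling(hidden_dim=768, k=256)
hidden = torch.randn(2, 4096, 768)
print(pool(hidden).shape)  # torch.Size([2, 256, 768])
```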