@inproceedings{dufter-etal-2020-increasing,
title = "Increasing Learning Efficiency of Self-Attention Networks through Direct Position Interactions, Learnable Temperature, and Convoluted Attention",
author = {Dufter, Philipp and
Schmitt, Martin and
Sch{\"u}tze, Hinrich},
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2020.coling-main.324/",
doi = "10.18653/v1/2020.coling-main.324",
pages = "3630--3636",
abstract = "Self-Attention Networks (SANs) are an integral part of successful neural architectures such as Transformer (Vaswani et al., 2017), and thus of pretrained language models such as BERT (Devlin et al., 2019) or GPT-3 (Brown et al., 2020). Training SANs on a task or pretraining them on language modeling requires large amounts of data and compute resources. We are searching for modifications to SANs that enable faster learning, i.e., higher accuracies after fewer update steps. We investigate three modifications to SANs: direct position interactions, learnable temperature, and convoluted attention. When evaluating them on part-of-speech tagging, we find that direct position interactions are an alternative to position embeddings, and convoluted attention has the potential to speed up the learning process."
}