@inproceedings{yu-etal-2023-trams,
title = "{TRAMS}: Training-free Memory Selection for Long-range Language Modeling",
author = "Yu, Haofei and
Wang, Cunxiang and
Zhang, Yue and
Bi, Wei",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.findings-emnlp.331/",
doi = "10.18653/v1/2023.findings-emnlp.331",
pages = "4966--4972",
abstract = "The Transformer architecture is crucial for numerous AI models, but it still faces challenges in long-range language modeling. Though several specific transformer architectures have been designed to tackle issues of long-range dependencies, existing methods like Transformer-XL are plagued by a high percentage of ineffective memories. In this study, we present a plug-and-play strategy, known as TRAining-free Memory Selection (TRAMS), that selects tokens participating in attention calculation based on one simple metric. This strategy allows us to keep tokens that are likely to have a high attention score with the current queries and ignore the other ones. We have tested our approach on the word-level benchmark (WikiText-103) and the character-level benchmark (enwik8), and the results indicate an improvement without having additional training or adding additional parameters."
}
Markdown (Informal)
[TRAMS: Training-free Memory Selection for Long-range Language Modeling](https://aclanthology.org/2023.findings-emnlp.331/) (Yu et al., Findings 2023)
ACL
Haofei Yu, Cunxiang Wang, Yue Zhang, and Wei Bi. 2023. [TRAMS: Training-free Memory Selection for Long-range Language Modeling](https://aclanthology.org/2023.findings-emnlp.331/). In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 4966–4972, Singapore. Association for Computational Linguistics.