@article{yogatama-etal-2021-adaptive,
title = "Adaptive Semiparametric Language Models",
author = "Yogatama, Dani and
de Masson d{'}Autume, Cyprien and
Kong, Lingpeng",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "9",
year = "2021",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.tacl-1.22/",
doi = "10.1162/tacl_a_00371",
pages = "362--373",
abstract = "We present a language model that combines a large parametric neural network (i.e., a transformer) with a non-parametric episodic memory component in an integrated architecture. Our model uses extended short-term context by caching local hidden states{---}similar to transformer-XL{---}and global long-term memory by retrieving a set of nearest neighbor tokens at each timestep. We design a gating function to adaptively combine multiple information sources to make a prediction. This mechanism allows the model to use either local context, short-term memory, or long-term memory (or any combination of them) on an ad hoc basis depending on the context. Experiments on word-based and character-based language modeling datasets demonstrate the efficacy of our proposed method compared to strong baselines."
}
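
The abstract describes a gating function that adaptively mixes predictions from the local context, a short-term cache of hidden states, and a long-term nearest-neighbor memory. The snippet below is a minimal sketch of such a gated mixture, not the authors' implementation: PyTorch is assumed, and the class, method, and parameter names are hypothetical placeholders for illustration only.

```python
import torch
import torch.nn as nn


class AdaptiveGate(nn.Module):
    """Illustrative sketch: mix next-token distributions from three
    information sources (local context, short-term cache, long-term
    k-NN memory) with weights conditioned on the current hidden state."""

    def __init__(self, hidden_dim: int, num_sources: int = 3):
        super().__init__()
        # One mixing logit per information source.
        self.gate = nn.Linear(hidden_dim, num_sources)

    def forward(self, hidden, p_local, p_short, p_long):
        # hidden: (batch, hidden_dim) current transformer state
        # p_*:    (batch, vocab) probability distribution from each source
        weights = torch.softmax(self.gate(hidden), dim=-1)          # (batch, 3)
        sources = torch.stack([p_local, p_short, p_long], dim=1)    # (batch, 3, vocab)
        # Weighted mixture over sources gives the final prediction.
        return (weights.unsqueeze(-1) * sources).sum(dim=1)         # (batch, vocab)
```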