% ALTA 2019 workshop paper (ACL Anthology ID U19-1011).
% The url field uses the canonical Anthology address rather than a preview-branch build.
@inproceedings{arora-etal-2019-lstm,
  title     = {Does an {LSTM} forget more than a {CNN}? {An} empirical study of catastrophic forgetting in {NLP}},
  author    = {Arora, Gaurav and
               Rahimi, Afshin and
               Baldwin, Timothy},
  editor    = {Mistica, Meladel and
               Piccardi, Massimo and
               MacKinlay, Andrew},
  booktitle = {Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association},
  month     = "4--6 " # dec,
  year      = {2019},
  address   = {Sydney, Australia},
  publisher = {Australasian Language Technology Association},
  url       = {https://aclanthology.org/U19-1011/},
  pages     = {77--86},
  abstract  = {Catastrophic forgetting {---} whereby a model trained on one task is fine-tuned on a second, and in doing so, suffers a {\textquotedblleft}catastrophic{\textquotedblright} drop in performance over the first task {---} is a hurdle in the development of better transfer learning techniques. Despite impressive progress in reducing catastrophic forgetting, we have limited understanding of how different architectures and hyper-parameters affect forgetting in a network. With this study, we aim to understand factors which cause forgetting during sequential training. Our primary finding is that CNNs forget less than LSTMs. We show that max-pooling is the underlying operation which helps CNNs alleviate forgetting compared to LSTMs. We also found that curriculum learning, placing a hard task towards the end of task sequence, reduces forgetting. We analysed the effect of fine-tuning contextual embeddings on catastrophic forgetting and found that using embeddings as feature extractor is preferable to fine-tuning in continual learning setup.},
}
Markdown (Informal)
[Does an LSTM forget more than a CNN? An empirical study of catastrophic forgetting in NLP](https://aclanthology.org/U19-1011/) (Arora et al., ALTA 2019)
ACL