@inproceedings{kedia-etal-2021-beyond-reptile,
title = "Beyond Reptile: Meta-Learned Dot-Product Maximization between Gradients for Improved Single-Task Regularization",
author = "Kedia, Akhil and
Chinthakindi, Sai Chetan and
Ryu, Wonho",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.findings-emnlp.37/",
doi = "10.18653/v1/2021.findings-emnlp.37",
pages = "407--420",
abstract = "Meta-learning algorithms such as MAML, Reptile, and FOMAML have led to improved performance of several neural models. The primary difference between standard gradient descent and these meta-learning approaches is that they contain as a small component the gradient for maximizing dot-product between gradients of batches, leading to improved generalization. Previous work has shown that aligned gradients are related to generalization, and have also used the Reptile algorithm in a single-task setting to improve generalization. Inspired by these approaches for a single task setting, this paper proposes to use the finite differences first-order algorithm to calculate this gradient from dot-product of gradients, allowing explicit control on the weightage of this component relative to standard gradients. We use this gradient as a regularization technique, leading to more aligned gradients between different batches. By using the finite differences approximation, our approach does not suffer from O(n{\textasciicircum}2) memory usage of naively calculating the Hessian and can be easily applied to large models with large batch sizes. Our approach achieves state-of-the-art performance on the Gigaword dataset, and shows performance improvements on several datasets such as SQuAD-v2.0, Quasar-T, NewsQA and all the SuperGLUE datasets, with a range of models such as BERT, RoBERTa and ELECTRA. Our method also outperforms previous approaches of Reptile and FOMAML when used as a regularization technique, in both single and multi-task settings. Our method is model agnostic, and introduces no extra trainable weights."
}
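The abstract describes the core mechanism: add to the ordinary gradient a regularization term that increases the dot product between the gradients of two batches, and compute that term with a finite-difference Hessian-vector product instead of the full Hessian. Below is a minimal, hypothetical PyTorch-style sketch of that idea, based only on the abstract (not the authors' released code); the names `aligned_step`, `loss_fn`, `lam`, and `eps`, and the two-batch interface, are illustrative assumptions.

```python
# Hypothetical sketch of a finite-difference gradient-alignment regularizer,
# reconstructed from the abstract above; not the authors' implementation.
import torch

def grads(model, loss_fn, batch):
    """Gradients of the batch loss w.r.t. all trainable parameters."""
    params = [p for p in model.parameters() if p.requires_grad]
    return params, list(torch.autograd.grad(loss_fn(model, batch), params))

def aligned_step(model, loss_fn, batch1, batch2, lr=1e-3, lam=0.1, eps=1e-2):
    # Gradients of two different batches at the current parameters.
    params, g1 = grads(model, loss_fn, batch1)
    _, g2 = grads(model, loss_fn, batch2)

    # Finite-difference approximation of the Hessian-vector product H1 @ g2:
    # perturb the parameters along g2, recompute the batch-1 gradient, and
    # take the difference. No O(n^2) Hessian is ever materialized.
    with torch.no_grad():
        for p, v in zip(params, g2):
            p.add_(eps * v)
    _, g1_shifted = grads(model, loss_fn, batch1)
    with torch.no_grad():
        for p, v in zip(params, g2):
            p.sub_(eps * v)  # restore the original parameters
    hvp = [(a - b) / eps for a, b in zip(g1_shifted, g1)]

    # Plain SGD step on batch 1 plus a term that *increases* g1 . g2
    # (hence the minus sign inside), weighted by the explicit coefficient lam.
    # The symmetric term H2 @ g1 could be added in the same way.
    with torch.no_grad():
        for p, g, h in zip(params, g1, hvp):
            p.sub_(lr * (g - lam * h))
```

The sketch costs one extra backward pass per step and introduces no additional trainable weights, which is consistent with the abstract's claim that the method is model agnostic; the coefficient `lam` gives the explicit control over the alignment term that the abstract emphasizes.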