@inproceedings{lin-etal-2020-pruning,
title = "Pruning Redundant Mappings in Transformer Models via Spectral-Normalized Identity Prior",
author = "Lin, Zi and
Liu, Jeremiah and
Yang, Zi and
Hua, Nan and
Roth, Dan",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2020.findings-emnlp.64/",
doi = "10.18653/v1/2020.findings-emnlp.64",
pages = "719--730",
    abstract = "Traditional (unstructured) pruning methods for a Transformer model focus on regularizing the individual weights by penalizing them toward zero. In this work, we explore spectral-normalized identity priors (SNIP), a structured pruning approach which penalizes an entire residual module in a Transformer model toward an identity mapping. Our method identifies and discards unimportant non-linear mappings in the residual connections by applying a thresholding operator on the function norm, and is applicable to any structured module including a single attention head, an entire attention block, or a feed-forward subnetwork. Furthermore, we introduce spectral normalization to stabilize the distribution of the post-activation values of the Transformer layers, further improving the pruning effectiveness of the proposed methodology. We conduct experiments with BERT on 5 GLUE benchmark tasks to demonstrate that SNIP achieves effective pruning results while maintaining comparable performance. Specifically, we improve the performance over the state-of-the-art by 0.5 to 1.0{\%} on average at a 50{\%} compression ratio."
}
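
A minimal sketch of the idea described in the abstract: a residual block y = x + f(x) whose non-linear branch is regularized toward the zero function (so the block collapses to the identity mapping) and wrapped with spectral normalization, with pruning decided by thresholding an empirical function norm. The module layout, the threshold `tau`, and the norm estimate are illustrative assumptions, not the paper's exact formulation; `torch.nn.utils.spectral_norm` is the standard PyTorch utility.

```python
import torch
import torch.nn as nn


class ResidualBlockWithIdentityPrior(nn.Module):
    """Residual block y = x + f(x) with an identity prior on the branch f.

    Illustrative sketch only: if the norm of f(x) stays below `tau`,
    the whole non-linear branch is discarded and the block reduces to
    the identity mapping.
    """

    def __init__(self, dim, hidden, tau=1e-2):
        super().__init__()
        # Spectral normalization bounds the Lipschitz constant of each
        # linear map, stabilizing the scale of the branch output.
        self.fc1 = nn.utils.spectral_norm(nn.Linear(dim, hidden))
        self.fc2 = nn.utils.spectral_norm(nn.Linear(hidden, dim))
        self.act = nn.GELU()
        self.tau = tau
        self.pruned = False

    def branch(self, x):
        return self.fc2(self.act(self.fc1(x)))

    def forward(self, x):
        if self.pruned:
            return x  # branch removed: pure identity mapping
        return x + self.branch(x)

    def identity_penalty(self, x):
        # Regularizer pushing the residual branch toward the zero function.
        return self.branch(x).pow(2).mean()

    @torch.no_grad()
    def maybe_prune(self, x):
        # Threshold an empirical estimate of the function norm; prune
        # the branch if its contribution is negligible.
        if self.branch(x).norm(dim=-1).mean() < self.tau:
            self.pruned = True
```

During training, `identity_penalty` would be added to the task loss; after training, `maybe_prune` is called on a held-out batch to decide which blocks can be replaced by identity mappings.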