@inproceedings{kim-cho-2021-length,
title = "Length-Adaptive Transformer: Train Once with Length Drop, Use Anytime with Search",
author = "Kim, Gyuwan and
Cho, Kyunghyun",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.acl-long.508/",
doi = "10.18653/v1/2021.acl-long.508",
pages = "6501--6511",
abstract = "Despite transformers' impressive accuracy, their computational cost is often prohibitive to use with limited computational resources. Most previous approaches to improve inference efficiency require a separate model for each possible computational budget. In this paper, we extend PoWER-BERT (Goyal et al., 2020) and propose Length-Adaptive Transformer that can be used for various inference scenarios after one-shot training. We train a transformer with LengthDrop, a structural variant of dropout, which stochastically determines a sequence length at each layer. We then conduct a multi-objective evolutionary search to find a length configuration that maximizes the accuracy and minimizes the efficiency metric under any given computational budget. Additionally, we significantly extend the applicability of PoWER-BERT beyond sequence-level classification into token-level classification with Drop-and-Restore process that drops word-vectors temporarily in intermediate layers and restores at the last layer if necessary. We empirically verify the utility of the proposed approach by demonstrating the superior accuracy-efficiency trade-off under various setups, including span-based question answering and text classification. Code is available at \url{https://github.com/clovaai/lengthadaptive-transformer}."
}
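
The abstract describes LengthDrop (each layer stochastically shortens the sequence during training, so one model covers many computational budgets) and a post-hoc evolutionary search over per-layer length configurations. Below is a minimal sketch of the LengthDrop idea, assuming a plain PyTorch encoder. The names here (`sample_length_configuration`, `LengthDropEncoder`, `p_drop`) are illustrative assumptions, not the authors' API; the official implementation additionally selects which word vectors to keep by attention significance (PoWER-BERT style), runs the evolutionary search, and adds Drop-and-Restore for token-level tasks. See the repository linked in the abstract for the real code.

```python
# Minimal sketch of LengthDrop, under the assumptions stated above.
import random

import torch
import torch.nn as nn


def sample_length_configuration(seq_len, num_layers, p_drop=0.2):
    """Sample a non-increasing sequence length for every layer (LengthDrop)."""
    lengths, length = [], seq_len
    for _ in range(num_layers):
        # Keep between (1 - p_drop) * length and length word vectors at this layer.
        length = max(1, random.randint(int((1 - p_drop) * length), length))
        lengths.append(length)
    return lengths


class LengthDropEncoder(nn.Module):
    """Standard encoder layers that truncate their hidden states layer by layer.

    For simplicity this sketch keeps a prefix of the sequence; the paper keeps
    the most significant word vectors instead (PoWER-BERT-style extraction).
    """

    def __init__(self, d_model=256, nhead=4, num_layers=6):
        super().__init__()
        self.layers = nn.ModuleList(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
            for _ in range(num_layers)
        )

    def forward(self, hidden, length_config=None):
        # Training: sample a fresh configuration per batch (LengthDrop).
        # Inference: pass the configuration found by the evolutionary search.
        if length_config is None:
            length_config = sample_length_configuration(hidden.size(1), len(self.layers))
        for layer, keep_len in zip(self.layers, length_config):
            hidden = layer(hidden)[:, :keep_len]  # drop word vectors beyond keep_len
        return hidden


encoder = LengthDropEncoder()
out = encoder(torch.randn(2, 128, 256))  # batch of 2, 128 tokens, d_model = 256
print(out.shape)  # (2, <final sampled length>, 256)
```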
Markdown (Informal)
[Length-Adaptive Transformer: Train Once with Length Drop, Use Anytime with Search](https://aclanthology.org/2021.acl-long.508/) (Kim & Cho, ACL-IJCNLP 2021)
ACL
Gyuwan Kim and Kyunghyun Cho. 2021. [Length-Adaptive Transformer: Train Once with Length Drop, Use Anytime with Search](https://aclanthology.org/2021.acl-long.508/). In *Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)*, pages 6501–6511, Online. Association for Computational Linguistics.