@inproceedings{geva-etal-2021-whats,
  title     = {{What's} in Your Head? {Emergent} Behaviour in Multi-Task Transformer Models},
  author    = {Geva, Mor and
               Katz, Uri and
               Ben-Arie, Aviv and
               Berant, Jonathan},
  editor    = {Moens, Marie-Francine and
               Huang, Xuanjing and
               Specia, Lucia and
               Yih, Scott Wen-tau},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2021},
  address   = {Online and Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.emnlp-main.646/},
  doi       = {10.18653/v1/2021.emnlp-main.646},
  pages     = {8201--8215},
  abstract  = {The primary paradigm for multi-task training in natural language processing is to represent the input with a shared pre-trained language model, and add a small, thin network (head) per task. Given an input, a target head is the head that is selected for outputting the final prediction. In this work, we examine the behaviour of non-target heads, that is, the output of heads when given input that belongs to a different task than the one they were trained for. We find that non-target heads exhibit emergent behaviour, which may either explain the target task, or generalize beyond their original task. For example, in a numerical reasoning task, a span extraction head extracts from the input the arguments to a computation that results in a number generated by a target generative head. In addition, a summarization head that is trained with a target question answering head, outputs query-based summaries when given a question and a context from which the answer is to be extracted. This emergent behaviour suggests that multi-task training leads to non-trivial extrapolation of skills, which can be harnessed for interpretability and generalization.},
}
Markdown (Informal)
[What’s in Your Head? Emergent Behaviour in Multi-Task Transformer Models](https://aclanthology.org/2021.emnlp-main.646/) (Geva et al., EMNLP 2021)
ACL