@inproceedings{gordon-duh-2020-distill,
title = "Distill, Adapt, Distill: Training Small, In-Domain Models for Neural Machine Translation",
author = "Gordon, Mitchell and
Duh, Kevin",
editor = "Birch, Alexandra and
Finch, Andrew and
Hayashi, Hiroaki and
Heafield, Kenneth and
Junczys-Dowmunt, Marcin and
Konstas, Ioannis and
Li, Xian and
Neubig, Graham and
Oda, Yusuke",
booktitle = "Proceedings of the Fourth Workshop on Neural Generation and Translation",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2020.ngt-1.12/",
doi = "10.18653/v1/2020.ngt-1.12",
pages = "110--118",
abstract = "We explore best practices for training small, memory efficient machine translation models with sequence-level knowledge distillation in the domain adaptation setting. While both domain adaptation and knowledge distillation are widely-used, their interaction remains little understood. Our large-scale empirical results in machine translation (on three language pairs with three domains each) suggest distilling twice for best performance: once using general-domain data and again using in-domain data with an adapted teacher."
}
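
The abstract summarizes a training recipe ("distill, adapt, distill") rather than giving code. As a reading aid only, the Python sketch below illustrates the order of operations that recipe implies: train a general-domain teacher, distill it into a small student on general-domain data, adapt the teacher to the target domain, then distill again on in-domain data. Every name here (Corpus, Model, train, translate, distill_adapt_distill) is a hypothetical placeholder, not code or an API from the paper.

```python
# Hypothetical sketch of the "distill, adapt, distill" recipe described in the
# abstract. All classes and functions are illustrative stubs, not the authors'
# implementation or any specific NMT toolkit.
from dataclasses import dataclass, field


@dataclass
class Corpus:
    source: list  # source-language sentences
    target: list  # target-language sentences (references or teacher outputs)


@dataclass
class Model:
    name: str
    history: list = field(default_factory=list)  # records what it was trained on


def train(model: Model, data: Corpus, label: str) -> Model:
    """Placeholder for NMT training / continued training on `data`."""
    model.history.append(label)
    return model


def translate(model: Model, sources: list) -> list:
    """Placeholder for decoding; returns pseudo-translations of `sources`."""
    return [f"{model.name} translation of: {s}" for s in sources]


def distill_adapt_distill(general: Corpus, in_domain: Corpus) -> Model:
    # 1. Train a large general-domain teacher.
    teacher = train(Model("teacher"), general, "general-domain training")

    # 2. First distillation: train a small student on the teacher's
    #    translations of general-domain source text (sequence-level KD).
    general_kd = Corpus(general.source, translate(teacher, general.source))
    student = train(Model("student"), general_kd, "general-domain distillation")

    # 3. Adapt the teacher to the target domain with continued training.
    teacher = train(teacher, in_domain, "in-domain adaptation")

    # 4. Second distillation: continue training the student on the adapted
    #    teacher's translations of in-domain source text.
    in_domain_kd = Corpus(in_domain.source, translate(teacher, in_domain.source))
    student = train(student, in_domain_kd, "in-domain distillation")

    return student


if __name__ == "__main__":
    general = Corpus(["a general sentence"], ["its reference translation"])
    medical = Corpus(["a medical sentence"], ["its reference translation"])
    small_model = distill_adapt_distill(general, medical)
    print(small_model.history)
```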