@inproceedings{wu-etal-2021-tentrans,
title = "{T}en{T}rans High-Performance Inference Toolkit for {WMT}2021 Efficiency Task",
author = "Wu, Kaixin and
Hu, Bojie and
Ju, Qi",
editor = "Barrault, Loic and
Bojar, Ondřej and
Bougares, Fethi and
Chatterjee, Rajen and
Costa-jussà, Marta R. and
Federmann, Christian and
Fishel, Mark and
Fraser, Alexander and
Freitag, Markus and
Graham, Yvette and
Grundkiewicz, Roman and
Guzmán, Paco and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Kocmi, Tom and
Martins, André and
Morishita, Makoto and
Monz, Christof",
booktitle = "Proceedings of the Sixth Conference on Machine Translation",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2021.wmt-1.77/",
pages = "795--798",
abstract = "The paper describes the TenTrans`s submissions to the WMT 2021 Efficiency Shared Task. We explore training a variety of smaller compact transformer models using the teacher-student setup. Our model is trained by our self-developed open-source multilingual training platform TenTrans-Py. We also release an open-source high-performance inference toolkit for transformer models and the code is written in C++ completely. All additional optimizations are built on top of the inference engine including attention caching, kernel fusion, early-stop, and several other optimizations. In our submissions, the fastest system can translate more than 22,000 tokens per second with a single Tesla P4 while maintaining 38.36 BLEU on En-De newstest2019. Our trained models and more details are available in TenTrans-Decoding competition examples."
}
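The abstract lists attention caching as one of the decoder-side optimizations built into the C++ inference engine. The sketch below is not TenTrans code; it is a minimal, single-head illustration under assumed names (`KVCache`, `attend`) and toy dimensions of the general idea: during incremental decoding, the key/value vectors of already-generated positions are cached, so each step only appends the newest token's key/value and attends over the cache instead of recomputing the whole prefix.

```cpp
// Minimal sketch of attention (KV) caching in incremental decoding.
// Illustrative only: names, shapes, and projections are hypothetical.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct KVCache {
    std::vector<std::vector<float>> keys;    // one key vector per past position
    std::vector<std::vector<float>> values;  // one value vector per past position
};

// Single-head scaled dot-product attention of one query over the cached positions.
std::vector<float> attend(const std::vector<float>& query, const KVCache& cache) {
    const size_t d = query.size();
    std::vector<float> scores(cache.keys.size());
    float maxScore = -1e30f;
    for (size_t t = 0; t < cache.keys.size(); ++t) {
        float dot = 0.f;
        for (size_t i = 0; i < d; ++i) dot += query[i] * cache.keys[t][i];
        scores[t] = dot / std::sqrt(static_cast<float>(d));
        maxScore = std::max(maxScore, scores[t]);
    }
    // Softmax over cached positions, then weighted sum of cached values.
    float sum = 0.f;
    for (float& s : scores) { s = std::exp(s - maxScore); sum += s; }
    std::vector<float> out(d, 0.f);
    for (size_t t = 0; t < scores.size(); ++t) {
        const float w = scores[t] / sum;
        for (size_t i = 0; i < d; ++i) out[i] += w * cache.values[t][i];
    }
    return out;
}

int main() {
    KVCache cache;
    // Simulate three decoding steps; in a real decoder the query/key/value
    // vectors come from learned projections of the newest token's hidden state.
    for (int step = 0; step < 3; ++step) {
        std::vector<float> q(4, 0.1f * static_cast<float>(step + 1));
        cache.keys.push_back(q);    // append only this step's key ...
        cache.values.push_back(q);  // ... and value; earlier ones are reused
        std::vector<float> ctx = attend(q, cache);
        std::printf("step %d: context[0]=%.3f over %zu cached positions\n",
                    step, ctx[0], cache.keys.size());
    }
    return 0;
}
```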
Markdown (Informal)
[TenTrans High-Performance Inference Toolkit for WMT2021 Efficiency Task](https://aclanthology.org/2021.wmt-1.77/) (Wu et al., WMT 2021)