% ACL Anthology paper D19-1141: Galle (2019), on the effectiveness of BPE tokenization.
% The url field uses the canonical aclanthology.org address (not a preview/ingest branch).
@inproceedings{galle-2019-investigating,
    title = "Investigating the Effectiveness of {BPE}: The Power of Shorter Sequences",
    author = "Gall{\'e}, Matthias",
    editor = "Inui, Kentaro  and
      Jiang, Jing  and
      Ng, Vincent  and
      Wan, Xiaojun",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing ({EMNLP-IJCNLP})",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D19-1141/",
    doi = "10.18653/v1/D19-1141",
    pages = "1375--1381",
    abstract = "Byte-Pair Encoding (BPE) is an unsupervised sub-word tokenization technique, commonly used in neural machine translation and other NLP tasks. Its effectiveness makes it a de facto standard, but the reasons for this are not well understood. We link BPE to the broader family of dictionary-based compression algorithms and compare it with other members of this family. Our experiments across datasets, language pairs, translation models, and vocabulary size show that - given a fixed vocabulary size budget - the fewer tokens an algorithm needs to cover the test set, the better the translation (as measured by BLEU)."
}

Markdown (Informal)
[Investigating the Effectiveness of BPE: The Power of Shorter Sequences](https://aclanthology.org/D19-1141/) (Gallé, EMNLP-IJCNLP 2019)
ACL
- Matthias Gallé. 2019. Investigating the Effectiveness of BPE: The Power of Shorter Sequences. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pages 1375–1381, Hong Kong, China. Association for Computational Linguistics.