@inproceedings{heffernan-etal-2022-bitext,
title = "Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages",
author = "Heffernan, Kevin and
{\c{C}}elebi, Onur and
Schwenk, Holger",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-emnlp.154/",
doi = "10.18653/v1/2022.findings-emnlp.154",
pages = "2101--2112",
    abstract = "Scaling multilingual representation learning beyond the hundred most frequent languages is challenging, in particular to cover the long tail of low-resource languages. We move away from the popular one-for-all multilingual models and focus on training multiple language (family) specific representations, but most prominently enable all languages to still be encoded in the same representational space. We focus on teacher-student training, allowing all encoders to be mutually compatible for bitext mining, and enabling fast learning of new languages. We also combine supervised and self-supervised training, allowing encoders to take advantage of monolingual training data. Our approach significantly outperforms the original LASER encoder. We study very low-resource languages and handle 44 African languages, many of which are not covered by any other model. For these languages, we train sentence encoders and mine bitexts. Adding these mined bitexts yielded an improvement of 3.8 BLEU for NMT into English."
}
Markdown (Informal)
[Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages](https://aclanthology.org/2022.findings-emnlp.154/) (Heffernan et al., Findings 2022)
ACL
Kevin Heffernan, Onur Çelebi, and Holger Schwenk. 2022. [Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages](https://aclanthology.org/2022.findings-emnlp.154/). In *Findings of the Association for Computational Linguistics: EMNLP 2022*, pages 2101–2112, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.