@comment{
  Workshop paper from MRL 2021 (ACL Anthology ID 2021.mrl-1.9).
  The url field uses the canonical aclanthology.org landing page rather
  than a preview.aclanthology.org staging build, which is not a stable link.
  NOTE(review): the last editor name (Sahin, Gozde Gul) is kept exactly as
  exported; it may be missing diacritics -- confirm against the Anthology
  record before publishing.
}
@inproceedings{cserhati-berend-2021-identifying,
  title     = {Identifying the Importance of Content Overlap for Better Cross-lingual Embedding Mappings},
  author    = {Cserh{\'a}ti, R{\'e}ka and
               Berend, G{\'a}bor},
  editor    = {Ataman, Duygu and
               Birch, Alexandra and
               Conneau, Alexis and
               Firat, Orhan and
               Ruder, Sebastian and
               Sahin, Gozde Gul},
  booktitle = {Proceedings of the 1st Workshop on Multilingual Representation Learning},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.mrl-1.9/},
  doi       = {10.18653/v1/2021.mrl-1.9},
  pages     = {96--106},
  abstract  = {In this work, we analyze the performance and properties of cross-lingual word embedding models created by mapping-based alignment methods. We use several measures of corpus and embedding similarity to predict BLI scores of cross-lingual embedding mappings over three types of corpora, three embedding methods and 55 language pairs. Our experimental results corroborate that instead of mere size, the amount of common content in the training corpora is essential. This phenomenon manifests in that i) despite of the smaller corpus sizes, using only the comparable parts of Wikipedia for training the monolingual embedding spaces to be mapped is often more efficient than relying on all the contents of Wikipedia, ii) the smaller, in return less diversified Spanish Wikipedia works almost always much better as a training corpus for bilingual mappings than the ubiquitously used English Wikipedia.},
}
Markdown (Informal)
[Identifying the Importance of Content Overlap for Better Cross-lingual Embedding Mappings](https://aclanthology.org/2021.mrl-1.9/) (Cserháti & Berend, MRL 2021)
ACL