% Stringham and Izbicki, Eval4NLP 2020 workshop paper on evaluating word embeddings.
% The url field holds the canonical ACL Anthology landing page (not a preview/staging
% build); the doi field is the persistent identifier and should be preferred by styles.
@inproceedings{stringham-izbicki-2020-evaluating,
  author    = {Stringham, Nathan and
               Izbicki, Mike},
  title     = {Evaluating Word Embeddings on Low-Resource Languages},
  editor    = {Eger, Steffen and
               Gao, Yang and
               Peyrard, Maxime and
               Zhao, Wei and
               Hovy, Eduard},
  booktitle = {Proceedings of the First Workshop on Evaluation and Comparison of {NLP} Systems},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {176--186},
  doi       = {10.18653/v1/2020.eval4nlp-1.17},
  url       = {https://aclanthology.org/2020.eval4nlp-1.17/},
  abstract  = {The analogy task introduced by Mikolov et al. (2013) has become the standard metric for tuning the hyperparameters of word embedding models. In this paper, however, we argue that the analogy task is unsuitable for low-resource languages for two reasons: (1) it requires that word embeddings be trained on large amounts of text, and (2) analogies may not be well-defined in some low-resource settings. We solve these problems by introducing the OddOneOut and Topk tasks, which are specifically designed for model selection in the low-resource setting. We use these metrics to successfully tune hyperparameters for a low-resource emoji embedding task and word embeddings on 16 extinct languages. The largest of these languages (Ancient Hebrew) has a 41 million token dataset, and the smallest (Old Gujarati) has only a 1813 token dataset.},
}
Markdown (Informal)
[Evaluating Word Embeddings on Low-Resource Languages](https://aclanthology.org/2020.eval4nlp-1.17/) (Stringham & Izbicki, Eval4NLP 2020)
ACL