@inproceedings{stringham-izbicki-2020-evaluating,
title = "Evaluating Word Embeddings on Low-Resource Languages",
author = "Stringham, Nathan and
Izbicki, Mike",
booktitle = "Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.eval4nlp-1.17",
doi = "10.18653/v1/2020.eval4nlp-1.17",
pages = "176--186",
abstract = "The analogy task introduced by Mikolov et al. (2013) has become the standard metric for tuning the hyperparameters of word embedding models. In this paper, however, we argue that the analogy task is unsuitable for low-resource languages for two reasons: (1) it requires that word embeddings be trained on large amounts of text, and (2) analogies may not be well-defined in some low-resource settings. We solve these problems by introducing the OddOneOut and Topk tasks, which are specifically designed for model selection in the low-resource setting. We use these metrics to successfully tune hyperparameters for a low-resource emoji embedding task and word embeddings on 16 extinct languages. The largest of these languages (Ancient Hebrew) has a 41 million token dataset, and the smallest (Old Gujarati) has only a 1813 token dataset.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="stringham-izbicki-2020-evaluating">
<titleInfo>
<title>Evaluating Word Embeddings on Low-Resource Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nathan</namePart>
<namePart type="family">Stringham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Izbicki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued encoding="w3cdtf">2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The analogy task introduced by Mikolov et al. (2013) has become the standard metric for tuning the hyperparameters of word embedding models. In this paper, however, we argue that the analogy task is unsuitable for low-resource languages for two reasons: (1) it requires that word embeddings be trained on large amounts of text, and (2) analogies may not be well-defined in some low-resource settings. We solve these problems by introducing the OddOneOut and Topk tasks, which are specifically designed for model selection in the low-resource setting. We use these metrics to successfully tune hyperparameters for a low-resource emoji embedding task and word embeddings on 16 extinct languages. The largest of these languages (Ancient Hebrew) has a 41 million token dataset, and the smallest (Old Gujarati) has only a 1813 token dataset.</abstract>
<identifier type="citekey">stringham-izbicki-2020-evaluating</identifier>
<identifier type="doi">10.18653/v1/2020.eval4nlp-1.17</identifier>
<location>
<url>https://aclanthology.org/2020.eval4nlp-1.17</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>176</start>
<end>186</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Word Embeddings on Low-Resource Languages
%A Stringham, Nathan
%A Izbicki, Mike
%S Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F stringham-izbicki-2020-evaluating
%X The analogy task introduced by Mikolov et al. (2013) has become the standard metric for tuning the hyperparameters of word embedding models. In this paper, however, we argue that the analogy task is unsuitable for low-resource languages for two reasons: (1) it requires that word embeddings be trained on large amounts of text, and (2) analogies may not be well-defined in some low-resource settings. We solve these problems by introducing the OddOneOut and Topk tasks, which are specifically designed for model selection in the low-resource setting. We use these metrics to successfully tune hyperparameters for a low-resource emoji embedding task and word embeddings on 16 extinct languages. The largest of these languages (Ancient Hebrew) has a 41 million token dataset, and the smallest (Old Gujarati) has only a 1813 token dataset.
%R 10.18653/v1/2020.eval4nlp-1.17
%U https://aclanthology.org/2020.eval4nlp-1.17
%U https://doi.org/10.18653/v1/2020.eval4nlp-1.17
%P 176-186
Markdown (Informal)
[Evaluating Word Embeddings on Low-Resource Languages](https://aclanthology.org/2020.eval4nlp-1.17) (Stringham & Izbicki, Eval4NLP 2020)
ACL