@inproceedings{chang-basit-2025-many,
title = "How many words does it take to understand a low-resource language?",
author = "Chang, Emily and
Basit, Nada",
editor = "Ebrahimi, Abteen and
Haider, Samar and
Liu, Emmy and
Haider, Sammar and
Leonor Pacheco, Maria and
Wein, Shira",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)",
month = apr,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-srw.21/",
pages = "207--224",
ISBN = "979-8-89176-192-6",
abstract = "When developing language technology, researchers have routinely turned to transfer learning to resolve the data scarcity conundrum presented in low-resource languages. As far as we know, this study is the first to evaluate the amount of documentation needed for transfer learning, specifically the smallest vocabulary size needed to create a sentence embedding space. In adopting widely spoken languages as a proxy for low-resource languages, our experiments show that the relationship between a sentence embedding{'}s vocabulary size and performance is logarithmic with performance leveling at a vocabulary size of 25,000. It should be noted that this relationship cannot be replicated across all languages and this level of documentation does not exist for many low-resource languages. We do observe, however, that performance accelerates at a vocabulary size of $\le$ 1000, a quantity that is present in most low-resource language documentation. These results can aid researchers in understanding whether a low-resource language has enough documentation necessary to support the creation of a sentence embedding and language model."
}
Markdown (Informal)
[How many words does it take to understand a low-resource language?](https://preview.aclanthology.org/fix-sig-urls/2025.naacl-srw.21/) (Chang & Basit, NAACL 2025)
ACL
- Emily Chang and Nada Basit. 2025. How many words does it take to understand a low-resource language?. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 207–224, Albuquerque, USA. Association for Computational Linguistics.