% ICON 2021 paper; ACL Anthology ID 2021.icon-main.74. The url field uses the canonical Anthology address.
@inproceedings{fernando-dias-2021-building,
    title = "Building a Linguistic Resource : A Word Frequency List for {Sinhala}",
    author = "Fernando, Aloka and
      Dias, Gihan",
    editor = "Bandyopadhyay, Sivaji and
      Devi, Sobha Lalitha and
      Bhattacharyya, Pushpak",
    booktitle = "Proceedings of the 18th International Conference on Natural Language Processing ({ICON})",
    month = dec,
    year = "2021",
    address = "National Institute of Technology Silchar, Silchar, India",
    publisher = "NLP Association of India (NLPAI)",
    url = "https://aclanthology.org/2021.icon-main.74/",
    pages = "606--610",
    abstract = "A word frequency list is a list of unique words in a language along with their frequency count. It is generally sorted by frequency. Such a list is essential for many NLP tasks, including building language models, POS taggers, spelling checkers, word separation guides, etc., in addition to assisting language learners. Such lists are available for many languages, but a large-scale word list is still not available for Sinhala. We have developed a comprehensive list of words, together with their frequency and part-of-speech (POS), from a large textbase. Unlike many other such lists, our list includes a large number of low-frequency words (many of which are erroneous), which enables the analysis of such words, including the frequencies of errors. In addition to the main list, we have also prepared a list of linguistically verified words. The word frequency list and the verified word list are the largest collections of words lists that are available for the Sinhala language."
}
Markdown (Informal)
[Building a Linguistic Resource : A Word Frequency List for Sinhala](https://aclanthology.org/2021.icon-main.74/) (Fernando & Dias, ICON 2021)
ACL