% KoFREN paper from the LREC-COLING 2024 proceedings.
% The url field uses the canonical ACL Anthology address rather than a
% temporary preview-branch link; the abstract is kept verbatim apart from
% restoring the hyphen in "part-of-speech".
@inproceedings{kim-etal-2024-kofren,
  title     = {{KoFREN}: Comprehensive {Korean} Word Frequency Norms Derived from Large Scale Free Speech Corpora},
  author    = {Kim, Jin-seo and
               Choi, Anna Seo Gyeong and
               Cho, Sunghye},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.866/},
  pages     = {9926--9931},
  abstract  = {Word frequencies are integral in linguistic studies, showing strong correlations with speakers' cognitive abilities and other important linguistic parameters including the Age of Acquisition (AoA). However, the formulation of credible Korean word frequency norms has been obstructed by the lack of expansive speech data and a reliable part-of-speech (POS) tagger. In this study, we unveil Korean word frequency norms (KoFREN), derived from large-scale spontaneous speech corpora (41 million words) that include a balanced representation of gender and age. We employed a machine learning-powered POS tagger, showcasing accuracy on par with human annotators. Our frequency norms correlate significantly with external studies' lexical decision time (LDT) and AoA measures. KoFREN also aligns with English counterparts sourced from SUBTLEX{\_}US - an English word frequency measure that has been frequently used in the literature. KoFREN is poised to facilitate research in spontaneous Contemporary Korean and can be utilized in many fields, including clinical studies of Korean patients.},
}
Markdown (Informal)
[KoFREN: Comprehensive Korean Word Frequency Norms Derived from Large Scale Free Speech Corpora](https://aclanthology.org/2024.lrec-main.866/) (Kim et al., LREC-COLING 2024)
ACL