% Starko and Rysin, UNLP 2023 workshop paper (pp. 91--95) on the ongoing
% project to build BRUK, a disambiguated, POS-tagged corpus of Modern Ukrainian.
% The url field uses the canonical ACL Anthology address (same paper ID as the
% DOI suffix) instead of a temporary preview-branch link.
% Case-sensitive words in the title are protected with whole-word braces.
@inproceedings{starko-rysin-2023-creating,
  title     = {Creating a {POS} Gold Standard Corpus of {Modern} {Ukrainian}},
  author    = {Starko, Vasyl and
               Rysin, Andriy},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Second Ukrainian Natural Language Processing Workshop (UNLP)},
  month     = may,
  year      = {2023},
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.unlp-1.11/},
  doi       = {10.18653/v1/2023.unlp-1.11},
  pages     = {91--95},
  abstract  = {This paper presents an ongoing project to create the Ukrainian Brown Corpus (BRUK), a disambiguated corpus of Modern Ukrainian. Inspired by and loosely based on the original Brown University corpus, BRUK contains one million words, spans 11 years (2010{--}2020), and represents edited written Ukrainian. Using stratified random sampling, we have selected fragments of texts from multiple sources to ensure maximum variety, fill nine predefined categories, and produce a balanced corpus. BRUK has been automatically POS-tagged with the help of our tools (a large morphological dictionary of Ukrainian and a tagger). A manually disambiguated and validated subset of BRUK (450,000 words) has been made available online. This gold standard, the biggest of its kind for Ukrainian, fills a critical need in the NLP ecosystem for this language. The ultimate goal is to produce a fully disambiguated one-million corpus of Modern Ukrainian.},
}
Markdown (Informal)
[Creating a POS Gold Standard Corpus of Modern Ukrainian](https://aclanthology.org/2023.unlp-1.11/) (Starko & Rysin, UNLP 2023)
ACL