% Workshop paper record for KyrText (LoResLM 2026 proceedings).
% NOTE(review): the url field targets a preview.aclanthology.org build; confirm and swap in the canonical Anthology URL once it is live.
@inproceedings{chubakov-2026-kyrtext,
  title     = {{KyrText}: A Multi-Domain Large-Scale Corpus for {Kyrgyz} Language},
  author    = {Chubakov, Tilek},
  editor    = {Hettiarachchi, Hansi and
               Ranasinghe, Tharindu and
               Plum, Alistair and
               Rayson, Paul and
               Mitkov, Ruslan and
               Gaber, Mohamed and
               Premasiri, Damith and
               Tan, Fiona Anting and
               Uyangodage, Lasitha},
  booktitle = {Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({LoResLM} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.loreslm-1.39/},
  pages     = {453--460},
  isbn      = {979-8-89176-377-7},
  abstract  = {Kyrgyz is a morphologically rich Turkic language that remains significantly underrepresented in modern multilingual language models. To address this resource gap, we introduce KyrText, a diverse, large-scale corpus containing 680.5 million words. Unlike existing web-crawled datasets which are often noisy or misidentified, KyrText aggregates high-quality news, Wikipedia entries, digitized literature, and extensive legal archives from the Supreme Court and Ministry of Justice of the Kyrgyz Republic. We leverage this corpus for the continual pre-training of mBERT, XLM-R, and DeBERTaV3, while also training RoBERTa architectures from scratch. Evaluations across several benchmarks{---}including natural language inference (XNLI), question answering (BoolQ), sentiment analysis (SST-2), and paraphrase identification (PAWS-X){---}demonstrate that targeted pre-training on KyrText yields substantial performance improvements over baseline multilingual models. Our findings indicate that while base-sized models benefit immediately from this domain-specific data, larger architectures require more extensive training cycles to fully realize their potential. We release our corpus and suite of models to establish a new foundation for Kyrgyz Natural Language Processing.},
}
Markdown (Informal)
[KyrText: A Multi-Domain Large-Scale Corpus for Kyrgyz Language](https://preview.aclanthology.org/manual-author-scripts/2026.loreslm-1.39/) (Chubakov, LoResLM 2026)
ACL