% Conference paper (NLP4DH 2026 proceedings): corpus of movie reviews from Kazakhstan.
% Proper nouns in the title are brace-protected as whole words so that
% sentence-casing styles keep their capitals.
% NOTE(review): url points at a preview/ingest host of the ACL Anthology;
%   confirm the canonical URL (and add a doi, if one exists) once published.
% NOTE(review): address presumably holds the conference venue rather than the
%   publisher's city; confirm against the target style's expectations.
@inproceedings{yeshpanov-2026-100000,
  title     = {100,000+ Movie Reviews from {Kazakhstan}: {Russian}, {Kazakh}, and Code-Switched Texts},
  author    = {Yeshpanov, Rustem},
  editor    = {Hamilton, Sil and
               {\"O}hman, Emily and
               Hicke, Rebecca M. M. and
               Bizzoni, Yuri and
               Bax, Axel and
               Matthews, Jacob A. and
               H{\"a}m{\"a}l{\"a}inen, Mika},
  booktitle = {Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.4/},
  pages     = {31--40},
  isbn      = {979-8-89176-427-9},
  abstract  = {We present a new publicly available corpus of 100,502 movie reviews from Kazakhstan collected from kino.kz, spanning 2001{--}2025 and covering 4,943 unique titles. The dataset is multilingual, consisting mainly of Russian reviews alongside Kazakh and code-switched texts. Reviews are manually annotated for language and sentiment polarity, and 11,309 reviews additionally contain explicit user-provided ratings. We define two sentiment tasks{---}three-way polarity classification and five-class score classification{---}and benchmark classical BoW/TF{--}IDF baselines against multilingual transformer models (mBERT, XLM-RoBERTa, RemBERT). Experimental results show that transformer models consistently outperform classical baselines on polarity classification, while score classification remains challenging under leakage-controlled evaluation due to severe class imbalance and subtle distinctions between adjacent rating levels.},
}
Markdown (Informal)
[100,000+ Movie Reviews from Kazakhstan: Russian, Kazakh, and Code-Switched Texts](https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.4/) (Yeshpanov, NLP4DH 2026)
ACL