% Student Research Workshop paper (EACL 2026, Volume 4) on comment-level bot
% detection in Polish Reddit communities.
% NOTE(review): the url field targets the preview.aclanthology.org ingest host;
% confirm whether a permanent canonical URL (or DOI) should replace it.
@inproceedings{matyjaszek-2026-pioneering,
  title     = {Pioneering Bot Detection on {Polish} {Reddit} at the Comment Level},
  author    = {Matyjaszek, Karmela},
  editor    = {Baez Santamaria, Selene and
               Somayajula, Sai Ashish and
               Yamaguchi, Atsuki},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 4: Student Research Workshop)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.65/},
  pages     = {881--894},
  isbn      = {979-8-89176-383-8},
  abstract  = {Research on bot detection in social media exhibits imbalance in several areas {---} across platforms, languages, and detection levels. Addressing these gaps, this study focuses on comment-level bot detection within Polish Reddit communities. We describe in detail the construction of a comprehensive dataset ({\textasciitilde}40,000 comments, 58{\%} bot-comment prevalence), which provides labels for the subsequent model training. Polish Reddit is inherently multilingual, we therefore take advantage of the linguistic signals, treating language composition of a comment as a feature on its own. We develop novel platform-specific, language-specific, and culturally informed features, and train comment-level classifiers from multiple model families on the manually annotated dataset. The resulting models achieve strong performance and temporal generalization to 2025 data. We analyze the importance and direction of these novel features across models and report that our `cross-level' interaction features, `Bottiquette' compliance signals, formatting markers, language indicators, repetition and randomness measures {---} especially the entropy of non-alphabetic characters {---} rank among the most decisive features. Finally, we complement our quantitative findings with a qualitative characterization of the Polish Reddit bot ecosystem. Overall, this study provides an important baseline for an underexplored setting and contributes to an open discussion on how to approach detection where data is linguistically mixed.},
}
Markdown (Informal)
[Pioneering Bot Detection on Polish Reddit at the Comment Level](https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.65/) (Matyjaszek, EACL 2026)
ACL
- Karmela Matyjaszek. 2026. Pioneering Bot Detection on Polish Reddit at the Comment Level. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 4: Student Research Workshop), pages 881–894, Rabat, Morocco. Association for Computational Linguistics.