% Conference paper by Steglich and Poppe in the EACL 2026 Student Research
% Workshop volume (Anthology ID 2026.eacl-srw.70, taken from the url field).
% NOTE(review): `url` points at a preview.aclanthology.org staging path
%   ("ingest-eacl"); presumably it should be replaced by the canonical
%   aclanthology.org URL once the volume is published -- confirm before citing.
% NOTE(review): `address` appears to hold the conference venue rather than the
%   publisher's city; kept as exported -- confirm against the target style.
@inproceedings{steglich-poppe-2026-active,
  title     = {Active Learning for Corpus Refinement: Cost-Effective Preprocessing to Improve Validity of Applied Quantitative Text Analysis},
  author    = {Steglich, Jakob and Poppe, Stephan},
  editor    = {Baez Santamaria, Selene and Somayajula, Sai Ashish and Yamaguchi, Atsuki},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 4: Student Research Workshop)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.70/},
  pages     = {952--966},
  isbn      = {979-8-89176-383-8},
  abstract  = {Quantitative text analysis relies on high-quality corpora, but keyword-based collection often retrieves irrelevant material, undermining validity. We show that active learning with a transformer-based classifier can iteratively refine corpora by excluding irrelevant documents, prompting researchers to clarify inclusion criteria and address edge cases. Applied to German newspaper articles on depression and schizophrenia, this approach improves construct validity and reduces labeling effort. The document relevance classifiers reached an F1-score of 0.8 with just 100{--}150 labeled snippets, with further gains from tuning, outperforming both random sampling and a weakly supervised sampling baseline. Filtering non-medical articles further had little effect on downstream depression stigmatization measures but increased schizophrenia stigmatization. Active learning thus enables efficient corpus validation and clearer concept boundaries with minimal preprocessing. The source code is publicly available at https://github.com/jakobstgl/active-learning-corpus-refinement.},
}
Markdown (Informal)
[Active Learning for Corpus Refinement: Cost-Effective Preprocessing to Improve Validity of Applied Quantitative Text Analysis](https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.70/) (Steglich & Poppe, EACL 2026)
ACL