% Conference paper from the NLP4DH 2025 proceedings (ACL Anthology ID 2025.nlp4dh-1.49).
% The url field uses the canonical Anthology address; preview-branch mirrors are temporary.
% The abstract is kept verbatim as published.
@inproceedings{toivanen-etal-2025-insights,
  author    = {Toivanen, Pihla and
               M{\"a}kel{\"a}, Eetu and
               Kanner, Antti},
  title     = {Insights into developing analytical categorization schemes: three problem types related to annotation agreement},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               {\"O}hman, Emily and
               Bizzoni, Yuri and
               Miyagawa, So and
               Alnajjar, Khalid},
  booktitle = {Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {570--577},
  isbn      = {979-8-89176-234-3},
  url       = {https://aclanthology.org/2025.nlp4dh-1.49/},
  abstract  = {Coding themes, frames, opinions and other attributes are widely used in the social sciences and doing that is also a base for building supervised text classifiers. Coding content needs a lot of resources, and lately this process has been utilized particularly in the training set annotation for machine learning models. Although the objectivity of coding is not always the purpose of coding, it helps in building the machine learning model, if the codings are uniformly done. Usually machine learning models are built by first defining annotation scheme, which contains definitions of categories and instructions for coding. It is known that multiple aspects affect to the annotation results, such as, the domain of annotation, number of annotators, and number of categories in annotation. In this article, we present few more problems that we show to be related with the annotation results in our case study. Those are negated presence of a category, low proportional presence of relevant content and implicit presence of a category. These problems should be resolved in all schemes on the level of scheme definition. To extract our problem categories, we focus on a media research case of extensive data on both the process as well as the results.},
}
Markdown (Informal)
[Insights into developing analytical categorization schemes: three problem types related to annotation agreement](https://aclanthology.org/2025.nlp4dh-1.49/) (Toivanen et al., NLP4DH 2025)
ACL