@inproceedings{li-etal-2025-assessing,
title = "Assessing Crowdsourced Annotations with {LLM}s: Linguistic Certainty as a Proxy for Trustworthiness",
author = "Li, Tianyi and
Sree, Divya and
Ringenberg, Tatiana",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Bizzoni, Yuri and
Miyagawa, So and
Alnajjar, Khalid},
booktitle = "Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities",
month = may,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.nlp4dh-1.16/",
pages = "191--201",
ISBN = "979-8-89176-234-3",
abstract = "Human-annotated data is fundamental for training machine learning models, yet crowdsourced annotations often contain noise and bias. In this paper, we investigate the feasibility of employing large language models (LLMs), specifically GPT-4, as evaluators of crowdsourced annotations using a zero-shot prompting strategy. We introduce a certainty-based approach that leverages linguistic cues categorized into five levels (Absolute, High, Moderate, Low, Uncertain) based on Rubin{'}s framework{---}to assess the trustworthiness of LLM-generated evaluations. Using the MAVEN dataset as a case study, we compare GPT-4 evaluations against human evaluations and observe that the alignment between LLM and human judgments is strongly correlated with response certainty. Our results indicate that LLMs can effectively serve as a preliminary filter to flag potentially erroneous annotations for further expert review."
}