@inproceedings{movva-etal-2024-annotation,
title = "Annotation alignment: Comparing {LLM} and human annotations of conversational safety",
author = "Movva, Rajiv and
Koh, Pang Wei and
Pierson, Emma",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Add-Cong-Liu-Florida-Atlantic-University-author-id/2024.emnlp-main.511/",
doi = "10.18653/v1/2024.emnlp-main.511",
pages = "9048--9062",
abstract = "Do LLMs align with human perceptions of safety? We study this question via *annotation alignment*, the extent to which LLMs and humans agree when annotating the safety of user-chatbot conversations. We leverage the recent DICES dataset (Aroyo et al. 2023), in which 350 conversations are each rated for safety by 112 annotators spanning 10 race-gender groups. GPT-4 achieves a Pearson correlation of $r=0.59$ with the average annotator rating, higher than the median annotator`s correlation with the average ($r=0.51$). We show that larger datasets are needed to resolve whether GPT-4 exhibits disparities in how well it correlates with different demographic groups. Also, there is substantial idiosyncratic variation in correlation within groups, suggesting that race {\&} gender do not fully capture differences in alignment. Finally, we find that GPT-4 cannot predict when one demographic group finds a conversation more unsafe than another."
}
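
The abstract's alignment metric (an LLM's Pearson correlation with the average annotator rating, compared against each annotator's correlation with the average) can be illustrated with a minimal sketch. This is not the authors' code: the ratings below are synthetic stand-ins for the DICES data, the 0/1 safety coding is hypothetical, and whether the paper uses a leave-one-out average is not stated in the abstract (the sketch uses one to avoid correlating an annotator with themselves).

```python
# Minimal sketch of the annotation-alignment comparison described in the abstract.
# Synthetic data stands in for DICES; illustrative only, not the authors' code.
import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)

n_conversations, n_annotators = 350, 112  # DICES dimensions from the abstract
# Hypothetical coding: 0 = safe, 1 = unsafe, one rating per annotator per conversation.
human = rng.integers(0, 2, size=(n_annotators, n_conversations)).astype(float)
# Stand-in for GPT-4's per-conversation safety scores.
llm = human.mean(axis=0) + rng.normal(0, 0.2, n_conversations)

# LLM alignment: correlation with the average annotator rating.
avg_rating = human.mean(axis=0)
llm_r, _ = pearsonr(llm, avg_rating)

# Each annotator's correlation with the average of the *other* annotators
# (leave-one-out), then take the median across annotators.
annotator_rs = []
for i in range(n_annotators):
    others_avg = np.delete(human, i, axis=0).mean(axis=0)
    r, _ = pearsonr(human[i], others_avg)
    annotator_rs.append(r)

print(f"LLM vs. average annotator rating: r = {llm_r:.2f}")
print(f"Median annotator vs. average of others: r = {np.median(annotator_rs):.2f}")
```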