@inproceedings{zaghouani-2026-annotation-frameworks,
title = "Annotation Frameworks Shape Model Knowledge: Safety Alignment in Large Language Models",
author = "Zaghouani, Wajdi",
editor = "Chen, Canyu and
Zhang, Yuji and
Li, Zoey Sha and
Wang, Zihan and
Wang, Qineng and
Su, Jinyan and
Kargupta, Priyanka and
Marjanovi{\'c}, Sara Vera and
Pan, Jeff Z. and
Bansal, Mohit and
Augenstein, Isabelle and
Han, Jiawei and
Ji, Heng and
Li, Manling",
booktitle = "Proceedings of the 4th Workshop on Towards Knowledgeable Foundation Models ({K}now{FM} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.knowfm-1.1/",
pages = "1--12",
ISBN = "979-8-89176-403-3",
abstract = "Large language models (LLMs) are commonly described as acquiring knowledge through large scale pretraining on textual corpora. This view underestimates the epistemic consequences of post training safety mechanisms. Modern LLMs undergo extensive safety alignment via curated datasets, human annotations, and reinforcement learning from human feedback (RLHF), processes that do not merely constrain outputs but actively reshape how propositional and procedural knowledge is accessed and expressed. We propose a conceptual framework in which safety alignment functions as a systematic form of knowledge editing at scale. Annotation frameworks used to construct safety datasets act as normative ontologies that partition language into categories of acceptable and unacceptable content, and alignment training propagates these distinctions into model behaviour. We introduce the Safety Knowledge Pipeline (SKP), a four stage framework describing how pretraining knowledge is progressively filtered, reframed, and constrained through annotation and alignment mechanisms. We identify three mechanisms of knowledge modification, suppression, reframing, and substitution, each with distinct diagnostic signals, and we operationalise them in a cross lingual evaluation protocol. Throughout, we distinguish carefully between behavioural claims that follow from prior empirical literature and representational claims that remain open hypotheses. Case studies spanning harmful instruction queries, hate speech annotation in Arabic dialects, and culturally variable discourse illustrate the framework. We further discuss how treating annotator disagreement as a training signal rather than noise can mitigate the culturally hegemonic effects of current alignment pipelines."
}