% Workshop paper record (ACL Anthology export layout).
% NOTE(review): the url field points at a "preview" host and an ingest path,
% which looks like a staging location -- confirm the canonical URL (and add a
% doi if one exists) before relying on this entry.
% NOTE(review): the abstract still contains the literal placeholder
% "(CITATION)" where the exporter dropped a reference -- confirm against the paper.
@inproceedings{kosireddy-lucas-2026-loss,
  title     = {Loss Masking Under the Hood: Backdoor Concealment and Private Data Memorization in {LLM}s},
  author    = {Kosireddy, Tagore Rao and
               Lucas, Evan},
  editor    = {Habernal, Ivan and
               Ghanavati, Sepideh and
               Haghighi, Sara and
               Ramesh, Krithika and
               Igamberdiev, Timour and
               Wilson, Shomir},
  booktitle = {Proceedings of the Seventh Workshop on Privacy in Natural Language Processing},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.privatenlp-main.5/},
  pages     = {69--79},
  isbn      = {979-8-89176-397-5},
  abstract  = {Loss masking has been proposed as a method for preventing language models from generating specific content by selectively zeroing the training loss on sensitive tokens, which allows a language model to learn protected content as context without learning to reproduce it (CITATION). Although promising, many critical questions about the impacts to a model remain unanswered. In this work, we investigate the impact of loss masking on internal model representation and context understanding using a small causal language model (GPT-2) at three scales (124M, 355M, 774M parameters) and apply mechanistic interpretability tools including causal tracing, attention analysis, and linear probing. We explore two use cases of loss-masking: backdoor concealment and prevention of memorization of named entities. In both settings, we find that loss masking successfully blocks generation of the protected tokens. Through mechanistic analysis, we show that protected token identity remains fully encoded in hidden states regardless of loss masking, confirming that loss masking suppresses the output pathway but not the internal encoding. Code is available at \url{https://github.com/Tagore-7/loss-masking-analysis}},
}

Markdown (Informal)
[Loss Masking Under the Hood: Backdoor Concealment and Private Data Memorization in LLMs](https://preview.aclanthology.org/ingest-acl-workshops/2026.privatenlp-main.5/) (Kosireddy & Lucas, PrivateNLP 2026)
ACL