% Conference paper in the *SEM 2026 proceedings (ACL Anthology export).
% NOTE(review): the url field targets the preview.aclanthology.org ingest
% build; presumably it should be switched to the canonical Anthology URL
% once the volume is published -- confirm before relying on it.
@inproceedings{lu-lam-2026-toxic,
  title     = {Toxic Subword Pruning for Dialogue Response Generation on Large Language Models},
  author    = {Lu, Hongyuan Adam and Lam, Wai},
  editor    = {Mohammad, Saif M. and Ousidhoum, Nedjma},
  booktitle = {Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*{SEM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.starsem-conference.35/},
  pages     = {516--528},
  isbn      = {979-8-89176-413-2},
  abstract  = {How to defend (possibly) toxic large language models (LLMs) from generating toxic content is an important research area. Yet, most research focused on defending jailbreak or toxic prompts on safe models. However, they could fail on already-toxic models, either unintentionally made by those individual developers or the attackers have access to model weights. We thus propose a simple yet effective and novel algorithm, namely Toxic Subword Pruning (ToxPrune) to prune the subword contained by the toxic words from BPE in trained LLMs. In contrast to the previous work that demonstrates pruning BPE tokens as harmful to the task of machine translation, we surprisingly found its usefulness in preventing toxic content from being generated on LLMs. Our methods have unique advantages. First, our findings suggest that ToxPrune simultaneously improves the toxic language model NSFW-3B on dialogue response generation. Second, ToxPrune also improved the official Llama-3.1-6B on the metric of diversity. Extensive automatic results and human evaluation indicate that ToxPrune could be helpful for both remediating toxic LLMs and improving non-toxic LLMs on the task of dialogue response generation.},
}
Markdown (Informal)
[Toxic Subword Pruning for Dialogue Response Generation on Large Language Models](https://preview.aclanthology.org/ingest-acl-workshops/2026.starsem-conference.35/) (Lu & Lam, \*SEM 2026)
ACL