@inproceedings{upadhayay-behzadan-2025-tongue,
title = "Tongue-Tied: Breaking {LLM}s Safety Through New Language Learning",
author = "Upadhayay, Bibek and
Behzadan, Vahid",
editor = "Winata, Genta Indra and
Kar, Sudipta and
Zhukova, Marina and
Solorio, Thamar and
Ai, Xi and
Hamed, Injy and
Ihsani, Mahardika Krisna and
Wijaya, Derry Tanti and
Kuwanto, Garry",
booktitle = "Proceedings of the 7th Workshop on Computational Approaches to Linguistic Code-Switching",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.calcs-1.5/",
pages = "32--47",
ISBN = "979-8-89176-053-0",
abstract = "The safety mechanisms of large language models (LLMs) have been shown to be fragile, as attackers can exploit prompts to generate harmful responses. Low-cost jailbreak attacks, such as those utilizing low-resource languages and code-switching, demonstrate that LLM safety mechanisms are vulnerable to low-resource languages. This indicates that safety training is particularly ineffective in low-resource languages. Furthermore, research has shown that fine-tuning LLMs with a small number of adversarial samples can compromise their safety training, implying that safety mechanism objectives can be overridden with the latest fine-tuning objectives. Based on the aforementioned statements, we hypothesize that the safety training of LLMs is language-dependent, and LLMs can potentially be compromised by fine-tuning them with new languages, even when using only harmless data.In this work, we used the low-resource language Newari and created two fake languages to LoRA-finetune LLMs with non-harmful data. Our results show that simply fine-tuning LLMs with new languages, even without the presence of harmful data, will jailbreak LLMs. Furthermore, we demonstrate that as we introduce English-to-and-from new language translation pairs in the training dataset, the attack success rate increases with harmful responses becoming more coherent. Additionally, we show the transferability of the attack by jailbreaking GPT-4 through finetuning with only 4,000 data points, and demonstrate that higher-capability models such as Claude-3.5-Sonnet can be compelled to learn to write in new languages through few-shot examples from in-context learning and can be jailbroken with new languages without fine-tuning. We furthermore investigate the fine-tuned LLMs' latents with logit lens and find that the new language fine-tuning weakens safety mechanisms by prioritizing new language fidelity over alignment, enabling jailbreaks via late-layer pivots to new language tokens that bypass English-centric safeguards. We have publicly released our trained model weights, dataset, and artifacts at this URL: https://github.com/UNHSAILLab/tongue-tied-breaking-llms-safety-through-new-language-learning"
}
Markdown (Informal)
[Tongue-Tied: Breaking LLMs Safety Through New Language Learning](https://preview.aclanthology.org/fix-sig-urls/2025.calcs-1.5/) (Upadhayay & Behzadan, CALCS 2025)