% Workshop paper in the BHASHA 2025 proceedings (ACL Anthology ID 2025.bhasha-1.10).
% Accented letters in the title and abstract are written as LaTeX escapes
% ({\=a} for a-macron, {\d{r}} for r-with-dot-below) so the record is pure ASCII
% and works with classic 8-bit BibTeX as well as Biber.
% The url field uses the canonical Anthology address built from the paper ID
% instead of the temporary preview/ingestion build.
@inproceedings{emani-r-2025-matrka,
  title     = {M{\=a}t{\d{r}}k{\=a}: Multilingual Jailbreak Evaluation of Open-Source Large Language Models},
  author    = {Emani, Murali and R, Kashyap Manjusha},
  editor    = {Bhattacharya, Arnab and Goyal, Pawan and Ghosh, Saptarshi and Ghosh, Kripabandhu},
  booktitle = {Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bhasha-1.10/},
  pages     = {117--121},
  isbn      = {979-8-89176-313-5},
  abstract  = {Artificial Intelligence (AI) and Large Language Models (LLMs) are increasingly integrated into high-stakes applications, yet their susceptibility to adversarial prompts poses significant security risks. In this work, we introduce M{\=a}t{\d{r}}k{\=a}, a framework for systematically evaluating jailbreak vulnerabilities in open-source multilingual LLMs. Using the open-source dataset across nine sensitive categories, we constructed adversarial prompt sets that combine translation, mixed-language encoding, homoglyph signatures, numeric enforcement, and structural variations. Experiments were conducted on state-of-the-art open-source models from Llama, Qwen, GPT-OSS, Mistral, and Gemma families. Our findings highlight transferability of jailbreaks across multiple languages with varying success rates depending on attack design. We provide empirical insights, a novel taxonomy of multilingual jailbreak strategies, and recommendations for enhancing robustness in safety-critical environments.},
}
Markdown (Informal)
[Mātṛkā: Multilingual Jailbreak Evaluation of Open-Source Large Language Models](https://aclanthology.org/2025.bhasha-1.10/) (Emani & R, BHASHA 2025)
ACL