% Tutorial abstract from the IJCNLP-AACL 2025 tutorial proceedings.
% The url field uses the canonical ACL Anthology address for anthology ID
% 2025.ijcnlp-tutorials.5 instead of the temporary preview/ingest staging host.
@inproceedings{banerjee-etal-2025-beyond,
  title     = {Beyond Guardrails: Advanced Safety for Large Language Models {---} Monolingual, Multilingual and Multimodal Frontiers},
  author    = {Banerjee, Somnath and Hazra, Rima and Mukherjee, Animesh},
  editor    = {Heinzerling, Benjamin and Ku, Lun-Wei},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: Tutorial Abstract},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.ijcnlp-tutorials.5/},
  pages     = {25--33},
  isbn      = {979-8-89176-302-9},
  abstract  = {LLMs are now embedded in workflows that span languages, modalities, and tools. This raises safety challenges that outpace conventional ``guardrails'': jailbreaks and prompt injections, attributional safety failures under code-mixing, multimodal bypass via typography and icons, activation-level manipulation, and agentic risks from tool use. This tutorial synthesizes the newest advances (2023{--}2025) and lays out open research questions around (i) failure modes in monolingual / multilingual / multimodal settings, (ii) training-time and inference-time defenses (rejection SFT, RLHF/RLAIF, decoding-time safety, parameter/activation steering), and (iii) evaluation and red-teaming pipelines balancing safety and utility. We anchor the tutorial with recent results including our safety related papers published at top tier conferences, and connect them to emerging best practices from recent safety tutorials. The target audience is researchers/engineers with basic NLP knowledge who want the latest techniques and a research roadmap; format is half-day with short demos and Q{\&}A.},
}
Markdown (Informal)
[Beyond Guardrails: Advanced Safety for Large Language Models — Monolingual, Multilingual and Multimodal Frontiers](https://aclanthology.org/2025.ijcnlp-tutorials.5/) (Banerjee et al., IJCNLP 2025)
ACL