@inproceedings{mamta-cocarascu-2025-guard,
title = "{I}-{GUARD}: Interpretability-Guided Parameter Optimization for Adversarial Defense",
author = "Mamta, Mamta and
Cocarascu, Oana",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1208/",
doi = "10.18653/v1/2025.findings-emnlp.1208",
pages = "22173--22188",
ISBN = "979-8-89176-335-7",
abstract = "Transformer-based models are highly vulnerable to adversarial attacks, where even small perturbations can cause significant misclassifications. This paper introduces *I-Guard*, a defense framework to increase the robustness of transformer-based models against adversarial perturbations. *I-Guard* leverages model interpretability to identify influential parameters responsible for adversarial misclassifications. By selectively fine-tuning a small fraction of model parameters, our approach effectively balances performance on both original and adversarial test sets. We conduct extensive experiments on English and code-mixed Hinglish datasets and demonstrate that *I-Guard* significantly improves model robustness. Furthermore, we demonstrate the transferability of *I-Guard* in handling other character-based perturbations."
}