@inproceedings{bouri-etal-2026-art,
    title = "{ART}: Attention-Regularized Transformers for Multi-Modal Robustness",
    author = "Bouri, Mohammed and
      Erradi, Mohammed and
      Saoud, Adnane",
    editor = "Demberg, Vera and
      Inui, Kentaro and
      Marquez, Llu{\'i}s",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.292/",
    pages = "5510--5535",
    isbn = "979-8-89176-386-9",
    abstract = "Transformers have become the standard in Natural Language Processing (NLP) and Computer Vision (CV) due to their strong performance, yet they remain highly sensitive to small input changes, often referred to as adversarial attacks, such as synonym swaps in text or pixel-level perturbations in images. These adversarial attacks can mislead predictions, while existing defenses are often domain-specific or lack formal robustness guarantees. We propose the \textit{Attention-Regularized Transformer} (ART), a framework that enhances robustness across modalities. ART builds on the \textit{Attention Sensitivity Tensor} (AST), which quantifies the effect of input perturbations on attention outputs. By incorporating an AST-based regularizer into training, ART encourages stable attention maps under adversarial perturbations in both text and image tasks. We evaluate ART on IMDB, QNLI, CIFAR-10, CIFAR-100, and Imagenette. Results show consistent robustness gains over strong baselines such as FreeLB and DSRM: up to $+36.9\%$ robust accuracy on IMDB and QNLI, and $+5${--}25{\%} on image benchmarks across multiple Vision Transformer (ViT) architectures, while maintaining or improving clean accuracy. ART is also highly efficient, training over $10\times$ faster than adversarial methods on text and requiring only $1.25\times$ the cost of standard training on images, compared to 1.5{--}$5.5\times$ for recent robust ViTs. Codes are available at \url{https://github.com/cliclab-um6p/ART}"
}
Markdown (Informal)
[ART: Attention-Regularized Transformers for Multi-Modal Robustness](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.292/) (Bouri et al., Findings 2026)
ACL