@inproceedings{adiga-etal-2025-attention,
title = "Attention Speaks Volumes: Localizing and Mitigating Bias in Language Models",
author = "Adiga, Rishabh and
Nushi, Besmira and
Chandrasekaran, Varun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.acl-long.1281/",
pages = "26403--26423",
ISBN = "979-8-89176-251-0",
abstract = "We believe that analyzing attention is crucial for understanding bias in large language models (LLMs); in ambiguous comparative prompting frameworks, it provides insight into how the LLM distributes its focus across different entities, and how this contributes to biased decisions. To this end, we first introduce a metric to quantify the ``entity preference'' of an LLM. We then propose $\textbf{ATLAS}$, a technique to localize bias to specific layers of the LLM by analyzing attention scores and then reduce bias by scaling attention in these biased layers. To evaluate our method, we conduct extensive experiments across 3 datasets, 4 models, and 4 baseline approaches. Our experiments demonstrate that bias is concentrated in the later layers, typically around the last third. We also show how $\textbf{ATLAS}$ effectively mitigates bias through targeted interventions without compromising downstream performance and an average increase of only 0.34{\%} in perplexity when the intervention is applied. We see an average improvement of 0.28 points in the bias score across all the datasets."
}
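The abstract sketches the intervention at a high level: measure a per-layer "entity preference" from attention scores, flag the most biased layers, and scale attention in those layers. The snippet below is a minimal illustrative sketch of that idea, not the authors' released implementation; the tensor shapes, entity token positions, scaling factor `alpha`, and top-third layer-selection heuristic are all assumptions made for illustration.

```python
# Minimal sketch (assumed shapes and names, not the ATLAS codebase):
# 1) measure "entity preference" per layer from attention of the last token,
# 2) flag the most biased layers,
# 3) downscale attention to the preferred entity's tokens there and renormalize.
import torch

def entity_preference(attn, ent1_idx, ent2_idx):
    # attn: (num_heads, seq_len) attention from the final token, one layer
    a1 = attn[:, ent1_idx].sum(dim=-1).mean()   # attention mass on entity 1 tokens
    a2 = attn[:, ent2_idx].sum(dim=-1).mean()   # attention mass on entity 2 tokens
    return ((a1 - a2) / (a1 + a2 + 1e-9)).item()  # signed preference in [-1, 1]

def scale_and_renormalize(attn, ent_idx, alpha=0.5):
    # Downscale attention to the preferred entity's tokens, then renormalize
    # each head's row so it still sums to 1.
    scaled = attn.clone()
    scaled[:, ent_idx] *= alpha
    return scaled / scaled.sum(dim=-1, keepdim=True)

if __name__ == "__main__":
    torch.manual_seed(0)
    num_layers, num_heads, seq_len = 12, 8, 16
    ent1, ent2 = [3, 4], [9, 10]    # assumed token positions of the two entities
    layers = [torch.softmax(torch.randn(num_heads, seq_len), dim=-1)
              for _ in range(num_layers)]

    prefs = [entity_preference(a, ent1, ent2) for a in layers]
    # Toy localization rule: intervene in the top third of layers by |preference|.
    k = num_layers // 3
    biased = sorted(range(num_layers), key=lambda i: abs(prefs[i]), reverse=True)[:k]
    for i in biased:
        preferred = ent1 if prefs[i] > 0 else ent2
        layers[i] = scale_and_renormalize(layers[i], preferred)
    print("biased layers:", sorted(biased))
```

The top-third selection here only loosely mirrors the abstract's observation that bias concentrates in roughly the last third of layers; the actual localization criterion in ATLAS comes from the paper's attention analysis, not this heuristic.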