@inproceedings{al-sahili-etal-2025-breaking,
title = "Breaking Language Barriers or Reinforcing Bias? A Study of Gender and Racial Disparities in Multilingual Contrastive Vision Language Models",
author = "Al Sahili, Zahraa and
Patras, Ioannis and
Purver, Matthew",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.20/",
pages = "331--352",
ISBN = "979-8-89176-298-5",
abstract = "Multilingual vision{--}language models (VLMs) promise universal image{--}text retrieval, yet their social biases remain under{-}explored.We perform the first systematic audit of four public multilingual CLIP variants{---}M{-}CLIP, NLLB{-}CLIP, CAPIVARA{-}CLIP, and the debiased SigLIP{-}2{---}covering ten languages that differ in resource availability and morphological gender marking.Using balanced subsets of FairFace and the PATA stereotype suite in a zero{-}shot setting, we quantify race and gender bias and measure stereotype amplification.Contrary to the intuition that multilinguality mitigates bias, \textit{every} model exhibits stronger gender skew than its English{-}only baseline.CAPIVARA{-}CLIP shows its largest biases precisely in the low{-}resource languages it targets, while the shared encoder of NLLB{-}CLIP and SigLIP{-}2 transfers English gender stereotypes into gender{-}neutral languages; loosely coupled encoders largely avoid this leakage.Although SigLIP{-}2 reduces agency and communion skews, it inherits{---}and in caption{-}sparse contexts (e.g., Xhosa) amplifies{---}the English anchor{'}s crime associations.Highly gendered languages consistently magnify all bias types, yet gender{-}neutral languages remain vulnerable whenever cross{-}lingual weight sharing imports foreign stereotypes.Aggregated metrics thus mask language{-}specific ``hot spots,'' underscoring the need for fine{-}grained, language{-}aware bias evaluation in future multilingual VLM research."
}