% Findings of ACL 2026 paper; key follows the ACL Anthology "surname-etal-year-firstword" scheme.
% {Gender} is braced so sentence-casing styles keep the capital after the question mark.
% NOTE(review): url points at a preview.aclanthology.org ingest build -- confirm whether the
% canonical aclanthology.org address should replace it once the volume is published.
@inproceedings{kim-etal-2026-whose,
  title     = {Whose Voice, Whose Avatar? {Gender} Matching Bias in Multimodal {AI} Teammates},
  author    = {Kim, Kyusik and
               Choi, Jaehoon and
               Yoo, Hyunwoo and
               Suh, Bongwon},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2057/},
  pages     = {41344--41367},
  isbn      = {979-8-89176-395-1},
  abstract  = {Multimodal Large Language Models (MLLMs) are increasingly deployed as social agents, yet their ability to integrate conflicting identity cues remains underexplored. We audit gender bias in ten recent MLLMs using a counterfactual cooperative gaming task that pairs synthetic voices with avatars of varying gender presentation and visual fidelity. Our analysis reveals distinct bias patterns that can occur independently: closed-source models (e.g., Gemini 2.5/3) exhibit a near-deterministic ``voice-matching'' bias that enforces binary alignment between voice and appearance, whereas open-weight models (e.g., Qwen-2.5-Omni-7B) show limited responsiveness to vocal cues and instead exhibit context-driven stereotypes, such as preferring male avatars in combat scenarios. We further find that reducing visual realism attenuates matching tendencies in some models. These findings demonstrate that multimodal fairness is not monolithic; models may appear unbiased on one dimension while enforcing strict identity congruence or role-based stereotypes on another. Code and data are available at \url{https://github.com/halfhoon/whose-voice-whose-avatar}.},
}
Markdown (Informal)
[Whose Voice, Whose Avatar? Gender Matching Bias in Multimodal AI Teammates](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2057/) (Kim et al., Findings 2026)
ACL