% ACL 2026 long paper: SH-Bench benchmark and Bystander Privacy Fine-Tuning (BPFT).
% NOTE(review): the url field below points at the ACL Anthology preview ("ingest-acl")
% host; presumably it should be switched to the canonical Anthology URL once the
% volume is published -- confirm before relying on it.
@inproceedings{zhan-etal-2026-protecting,
  title     = {Protecting Bystander Privacy via Selective Hearing in Audio {LLMs}},
  author    = {Zhan, Xiao and
               Sun, Guangzhi and
               Such, Jose and
               Woodland, Phil},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.693/},
  pages     = {15180--15192},
  isbn      = {979-8-89176-390-6},
  abstract  = {Audio Large language models (LLMs) are increasingly deployed in the real world, where they inevitably capture speech from unintended nearby bystanders, raising privacy risks that existing benchmarks and defences did not consider. We introduce SH-Bench, the first benchmark designed to evaluate selective hearing: a model{'}s ability to attend to an intended main speaker while refusing to process or reveal information about incidental bystander speech. SH-Bench contains 3,968 multi-speaker audio mixtures, including both real-world and synthetic scenarios, paired with 77k multiple-choice questions that probe models under general and selective operating modes. In addition, we propose Selective Efficacy (SE), a novel metric capturing both multi-speaker comprehension and bystander-privacy protection. Our evaluation of state-of-the-art open-source and proprietary LLMs reveals substantial bystander privacy leakage, with strong audio understanding failing to translate into selective protection of bystander privacy. To mitigate this gap, we also present Bystander Privacy Fine-Tuning (BPFT), a novel training pipeline that teaches models to refuse bystander-related queries without degrading main-speaker comprehension. We show that BPFT yields substantial gains, achieving an absolute 47{\%} higher bystander accuracy under selective mode and an absolute 16{\%} higher SE compared to Gemini 2.5 Pro, which is the best audio LLM without BPFT. Together, SH-Bench and BPFT provide the first systematic framework for measuring and improving bystander privacy in audio LLMs.},
}
Markdown (Informal)
[Protecting Bystander Privacy via Selective Hearing in Audio LLMs](https://preview.aclanthology.org/ingest-acl/2026.acl-long.693/) (Zhan et al., ACL 2026)
ACL
- Xiao Zhan, Guangzhi Sun, Jose Such, and Phil Woodland. 2026. Protecting Bystander Privacy via Selective Hearing in Audio LLMs. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 15180–15192, San Diego, California, United States. Association for Computational Linguistics.