% Conference paper record for the SCENEBench audio understanding benchmark
% (Iyer, Wang, Koyejo), in the EACL 2026 long-papers proceedings volume.
% Case-sensitive words in title/booktitle are brace-protected as whole words
% so sentence-casing bibliography styles leave them intact.
% NOTE(review): the url field points at a preview/ingest host path; confirm
% the permanent address once the volume is published and update if it differs.
@inproceedings{iyer-etal-2026-scenebench,
  title     = {{SCENEBench}: An Audio Understanding Benchmark Grounded in Assistive and Industrial Use Cases},
  author    = {Iyer, Laya and
               Wang, Angelina and
               Koyejo, Sanmi},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.335/},
  pages     = {7123--7137},
  isbn      = {979-8-89176-380-7},
  abstract  = {Advances in large language models (LLMs) have enabled significant capabilities in audio processing, resulting in state-of-the-art models now known as Large Audio Language Models (LALMs). However, minimal work has been done to measure audio understanding beyond automatic speech recognition (ASR). This paper closes that gap by proposing a benchmark suite, SCENEBench (Spatial, Cross-lingual, Environmental, Non-speech Evaluation), that targets a broad form of audio comprehension across four real-world categories: background sound understanding, noise localization, cross-linguistic speech understanding, and vocal characterizer recognition. In addition to performance, we also measure model latency. The purpose of this benchmark suite is to assess the audio beyond just what words are said{---} rather, in how they are said and the non-speech components of the audio. To strengthen ecological validity, we include a small human-recorded evaluation split per category. Based on the needs articulated by audio understanding use-cases of accessibility technology and industrial noise monitoring, this benchmark reveals critical gaps in current LALMs. The performance in each task is quite varied, with some tasks having performance far below random chance and others with high accuracy. We also provide a structured error taxonomy to characterize standard failure modes across tasks. These results provide direction for targeted improvements in model capabilities.},
}
Markdown (Informal)
[SCENEBench: An Audio Understanding Benchmark Grounded in Assistive and Industrial Use Cases](https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.335/) (Iyer et al., EACL 2026)
ACL