@inproceedings{pan-etal-2025-wild,
title = "In-the-wild Audio Spatialization with Flexible Text-guided Localization",
author = "Pan, Tianrui and
Liu, Jie and
Huang, Zewen and
Tang, Jie and
Wu, Gangshan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.98/",
pages = "1989--2001",
ISBN = "979-8-89176-251-0",
abstract = "Binaural audio enriches immersive experiences by enabling the perception of the spatial locations of sounding objects in AR, VR, and embodied AI applications. While existing audio spatialization methods can generally map any available monaural audio to binaural audio signals, they often lack the flexible and interactive control needed in complex multi-object user-interactive environments. To address this, we propose a Text-guided Audio Spatialization (TAS) framework that utilizes diverse text prompts and evaluates our model from unified generation and comprehension perspectives. Due to the limited availability of high-quality, large-scale stereo data, we construct the SpatialTAS dataset, which encompasses 376,000 simulated binaural audio samples to facilitate the training of our model. Our model learns binaural differences guided by 3D spatial location and relative position prompts, enhanced with flipped-channel audio. Experimental results show that our model can generate high quality binaural audios for various audio types on both simulated and real-recorded datasets. Besides, we establish an assessment model based on Llama-3.1-8B, which evaluates the semantic accuracy of spatial locations through a spatial reasoning task. Results demonstrate that by utilizing text prompts for flexible and interactive control, we can generate binaural audio with both high quality and semantic consistency in spatial locations."
}
Markdown (Informal)
[In-the-wild Audio Spatialization with Flexible Text-guided Localization](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.98/) (Pan et al., ACL 2025)
ACL