% Conference-paper record for "IFEval-Audio" (IJCNLP-AACL 2025 proceedings).
% The coined name in the title is brace-protected as a whole word so that
% sentence-casing bibliography styles do not lowercase part of it.
@inproceedings{gao-etal-2025-ifeval,
  title     = {{IFEval-Audio}: Benchmarking Instruction-Following Capability in Audio-based Large Language Models},
  author    = {Gao, Yiming and
               Wang, Bin and
               Wei, Chengwei and
               Sun, Shuo and
               Aw, AiTi},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-short.11/},
  pages     = {120--127},
  isbn      = {979-8-89176-299-2},
  abstract  = {Large language models (LLMs) have demonstrated strong instruction-following capabilities in text-based tasks. However, this ability often deteriorates in multimodal models after alignment with non-text modalities such as images or audio. While several recent efforts have investigated instruction-following performance in text and vision-language models, instruction-following in audio-based large language models remains largely unexplored. To bridge this gap, we introduce IFEval-Audio, a novel evaluation dataset designed to assess the ability to follow instructions in an audio LLM. IFEval-Audio contains 280 audio{--}instruction{--}answer triples across six diverse dimensions: Content, Capitalization, Symbol, List Structure, Length, and Format. Each example pairs an audio input with a text instruction, requiring the model to generate an output that follows a specified structure. We benchmark state-of-the-art audio LLMs on their ability to follow audio-involved instructions. The dataset is released publicly to support future research in this emerging area.},
}

Markdown (Informal)
[IFEval-Audio: Benchmarking Instruction-Following Capability in Audio-based Large Language Models](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-short.11/) (Gao et al., IJCNLP-AACL 2025)
ACL
- Yiming Gao, Bin Wang, Chengwei Wei, Shuo Sun, and AiTi Aw. 2025. IFEval-Audio: Benchmarking Instruction-Following Capability in Audio-based Large Language Models. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 120–127, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.