@inproceedings{sourada-2026-thesis,
    title     = {Thesis Proposal: Multimodal Benchmark for Music Understanding in Large Language Models},
    author    = {Sourada, Tom{\'a}{\v{s}}},
    editor    = {Baez Santamaria, Selene and
                 Somayajula, Sai Ashish and
                 Yamaguchi, Atsuki},
    booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.28/},
    pages     = {393--405},
    isbn      = {979-8-89176-383-8},
    abstract  = {Music is a universal cultural practice that influences emotion, ritual and creativity, and it is now represented in many digital modalities: audio recordings, symbolic encodings (MIDI, MusicXML, ABC), visual scores and lyrics. Multimodal Large Language Models (MLLMs) have the ambition to process ``everything'', including music, and therefore promise to support musical analysis, creation and education. Despite this promise, systematic methods for evaluating whether a MLLM understands music are missing. Existing music-focused benchmarks are fragmented, largely single-modality, Western-centric, and often do not require actual perception of the musical content; methodological details such as prompt design and answer-extraction are frequently omitted or not discussed, and some evaluations rely on proprietary LLMs, hindering reproducibility and raising concerns about test-data leakage. To fill this gap, this dissertation proposes to design a musically multimodal benchmark built on a transparent, fully open evaluation pipeline. The benchmark will present closed-question-answer items across four musical modalities, employ carefully engineered distractor options to enforce genuine perceptual engagement, and follow rigorously documented prompt-selection and answer-extraction procedures. It will further incorporate culturally diverse musical material beyond the dominant Western canon. Guided by three research questions: (1) how to devise robust, reproducible evaluation procedures, (2) how current MLLMs perform across modalities, and (3) how model scores relate to human musical abilities; the benchmark will enable precise diagnosis of model limitations, inform the development of more musically aware AI systems, and provide a principled basis for assessing practical usefulness to musicians and other stakeholders in the creative industry.},
}
Markdown (Informal)
[Thesis Proposal: Multimodal Benchmark for Music Understanding in Large Language Models](https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.28/) (Sourada, EACL 2026)
ACL