% Findings of ACL 2026 paper on music audio-visual question answering (Music AVQA).
% NOTE(review): the url field uses a preview.aclanthology.org "ingest-acl" path,
% presumably a staging link; confirm the canonical Anthology URL before relying on it.
@inproceedings{you-etal-2026-music,
  title     = {Music Audio-Visual Question Answering Requires Specialized Multimodal Designs},
  author    = {You, Wenhao and Diao, Xingjian and Huang, Wenjun and Zhang, Chunhui and Kong, Keyi and Wu, Weiyi and Ma, Chiyu and Ouyang, Zhongyu and Wu, Tingxuan and Cheng, Ming and Vosoughi, Soroush and Gui, Jiang},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.69/},
  pages     = {1392--1426},
  isbn      = {979-8-89176-395-1},
}