% Video-LLaMA (Zhang, Li, Bing), EMNLP 2023 System Demonstrations.
% The url is the canonical ACL Anthology permalink for ID 2023.emnlp-demo.49
% (same ID as the DOI suffix), not a temporary preview/ingestion build.
@inproceedings{zhang-etal-2023-video,
  author    = {Zhang, Hang and Li, Xin and Bing, Lidong},
  title     = {{Video-LLaMA}: An Instruction-tuned Audio-Visual Language Model for Video Understanding},
  editor    = {Feng, Yansong and Lefever, Els},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  pages     = {543--553},
  doi       = {10.18653/v1/2023.emnlp-demo.49},
  url       = {https://aclanthology.org/2023.emnlp-demo.49/},
}