% Conference paper from the proceedings of the 17th International Natural
% Language Generation Conference (2024), pages 440--445.
% The url field uses the permanent ACL Anthology address rather than a
% preview/staging build path; the DOI remains the primary identifier.
% Author and editor names are kept exactly as given in the source record.
@inproceedings{sagare-etal-2024-audio-visual,
  title     = {Audio-visual training for improved grounding in video-text {LLMs}},
  author    = {Sagare, Shivprasad Rajendra and
               S, Hemachandran and
               Sarabhai, Kinshuk and
               Ullegaddi, Prashant and
               Sa, Rajeshkumar},
  editor    = {Mahamood, Saad and
               Minh, Nguyen Le and
               Ippolito, Daphne},
  booktitle = {Proceedings of the 17th International Natural Language Generation Conference},
  month     = sep,
  year      = {2024},
  address   = {Tokyo, Japan},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.inlg-main.36/},
  doi       = {10.18653/v1/2024.inlg-main.36},
  pages     = {440--445},
}