% Wang, Wang, and Wang (2018): short paper in the NAACL-HLT 2018 proceedings (Volume 2).
% The url field uses the permanent ACL Anthology address for paper N18-2125
% rather than a temporary preview-branch deployment of the site.
@inproceedings{wang-etal-2018-watch,
  author    = {Wang, Xin and Wang, Yuan-Fang and Wang, William Yang},
  title     = {Watch, Listen, and Describe: Globally and Locally Aligned Cross-Modal Attentions for Video Captioning},
  editor    = {Walker, Marilyn and Ji, Heng and Stent, Amanda},
  booktitle = {Proceedings of the 2018 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)},
  month     = jun,
  year      = {2018},
  address   = {New Orleans, Louisiana},
  publisher = {Association for Computational Linguistics},
  pages     = {795--801},
  doi       = {10.18653/v1/N18-2125},
  url       = {https://aclanthology.org/N18-2125/},
}