% NAACL 2024 long paper (ACL Anthology ID 2024.naacl-long.229).
% The url field uses the canonical Anthology address rather than a preview mirror;
% apostrophes in the abstract are plain ASCII (') so LaTeX typesets them correctly.
@inproceedings{cheng-etal-2024-event,
  title     = {Event-Content-Oriented Dialogue Generation in Short Video},
  author    = {Cheng, Fenghua and
               Li, Xue and
               Huang, Zi and
               Wang, Jinxiang and
               Wang, Sen},
  editor    = {Duh, Kevin and
               Gomez, Helena and
               Bethard, Steven},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.naacl-long.229/},
  doi       = {10.18653/v1/2024.naacl-long.229},
  pages     = {4114--4124},
  abstract  = {Understanding complex events from different modalities, associating to external knowledge and generating response in a clear point of view are still unexplored in today's multi-modal dialogue research. The great challenges include 1) lack of event-based multi-modal dialogue dataset; 2) understanding of complex events and 3) heterogeneity gap between different modalities. To overcome these challenges, we firstly introduce a novel event-oriented video-dialogue dataset called SportsVD (Sports-domain Video-dialogue Dataset). To our best knowledge, SportsVD is the first dataset that consists of complex events videos and opinion-based conversations with regards to contents in these events. Meanwhile, we present multi-modal dialogue generation method VCD (Video Commentary Dialogue) to generate human-like response according to event contents in the video and related external knowledge. In contrast to previous video-based dialogue generation, we focus on opinion-based response and the understanding of longer and more complex event contents. We evaluate VCD's performance on SportsVD and other baselines under several automatic metrics. Experiments demonstrate VCD can outperform among other state-of-the-art baselines. Our work is available at https://github.com/Cheng-Fenghua/SportsVD.},
}
Markdown (Informal)
[Event-Content-Oriented Dialogue Generation in Short Video](https://aclanthology.org/2024.naacl-long.229/) (Cheng et al., NAACL 2024)
ACL
- Fenghua Cheng, Xue Li, Zi Huang, Jinxiang Wang, and Sen Wang. 2024. Event-Content-Oriented Dialogue Generation in Short Video. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 4114–4124, Mexico City, Mexico. Association for Computational Linguistics.