@inproceedings{ignat-etal-2024-learning,
title = "Learning Human Action Representations from Temporal Context in Lifestyle Vlogs",
author = "Ignat, Oana and
Castro, Santiago and
Li, Weiji and
Mihalcea, Rada",
editor = "Ustalov, Dmitry and
Gao, Yanjun and
Panchenko, Alexander and
Tutubalina, Elena and
Nikishina, Irina and
Ramesh, Arti and
Sakhovskiy, Andrey and
Usbeck, Ricardo and
Penn, Gerald and
Valentino, Marco",
booktitle = "Proceedings of TextGraphs-17: Graph-based Methods for Natural Language Processing",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.textgraphs-1.1",
pages = "1--18",
abstract = "We address the task of human action representation and show how the approach to generating word representations based on co-occurrence can be adapted to generate human action representations by analyzing their co-occurrence in videos. To this end, we formalize the new task of human action co-occurrence identification in online videos, i.e., determine whether two human actions are likely to co-occur in the same interval of time.We create and make publicly available the Co-Act (Action Co-occurrence) dataset, consisting of a large graph of {\textasciitilde}12k co-occurring pairs of visual actions and their corresponding video clips. We describe graph link prediction models that leverage visual and textual information to automatically infer if two actions are co-occurring.We show that graphs are particularly well suited to capture relations between human actions, and the learned graph representations are effective for our task and capture novel and relevant information across different data domains.",
}