@inproceedings{pan-etal-2025-propy,
    title     = {{ProPy}: Building Interactive Prompt Pyramids upon {CLIP} for Partially Relevant Video Retrieval},
    author    = {Pan, Yi and
                 Zhang, Yujia and
                 Kampffmeyer, Michael and
                 Zhao, Xiaoguang},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-emnlp.28/},
    doi       = {10.18653/v1/2025.findings-emnlp.28},
    pages     = {519--533},
    isbn      = {979-8-89176-335-7},
    abstract  = {Partially Relevant Video Retrieval (PRVR) is a practical yet challenging task that involves retrieving videos based on queries relevant to only specific segments. While existing works follow the paradigm of developing models to process unimodal features, powerful pretrained vision-language models like CLIP remain underexplored in this field. To bridge this gap, we propose ProPy, a model with systematic architectural adaption of CLIP specifically designed for PRVR. Drawing insights from the semantic relevance of multi-granularity events, ProPy introduces two key innovations: (1) A Prompt Pyramid, a hierarchical structure that organizes event prompts to capture semantics at multiple granularity levels, and (2) An Ancestor-Descendant Interaction Mechanism built on the pyramid that enables dynamic semantic interaction among events. With these designs, ProPy achieves SOTA performance on three public datasets, outperforming previous models by significant margins. We will release all code and checkpoints to facilitate further research.},
}

@comment{ The text below is a leftover citation blurb pasted from the ACL
  Anthology page; it is not part of the entry above. BibTeX ignores text
  outside entries, but it is kept here (separated from the entry) for
  reference. Note it still carries the original preview-mirror URL. }

Markdown (Informal)
[ProPy: Building Interactive Prompt Pyramids upon CLIP for Partially Relevant Video Retrieval](https://preview.aclanthology.org/name-variant-enfa-fane/2025.findings-emnlp.28/) (Pan et al., Findings 2025)
ACL