@inproceedings{kulkarni-fazli-2025-videopasta,
    title = "{V}ideo{PASTA}: 7{K} Preference Pairs That Matter for Video-{LLM} Alignment",
    author = "Kulkarni, Yogesh and
      Fazli, Pooyan",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1647/",
    pages = "32342--32367",
    isbn = "979-8-89176-332-6",
    abstract = "Video-language models (Video-LLMs) excel at understanding video content but struggle with spatial relationships, temporal ordering, and cross-frame continuity. To address these limitations, we introduce VideoPASTA (Preference Alignment with Spatio-Temporal-Cross Frame Adversaries), a framework that enhances Video-LLMs through targeted preference optimization. VideoPASTA trains models to distinguish accurate video representations from carefully crafted adversarial examples that deliberately violate spatial, temporal, or cross-frame relationships. With only 7,020 preference pairs and Direct Preference Optimization, VideoPASTA enables models to learn robust representations that capture fine-grained spatial details and long-range temporal dynamics. Experiments demonstrate that VideoPASTA is model agnostic and significantly improves performance, for example, achieving gains of up to +3.8 percentage points on LongVideoBench, +4.1 on VideoMME, and +4.0 on MVBench, when applied to various state-of-the-art Video-LLMs. These results demonstrate that targeted alignment, rather than massive pretraining or architectural modifications, effectively addresses core video-language challenges. Notably, VideoPASTA achieves these improvements without any human annotation or captioning, relying solely on 32-frame sampling. This efficiency makes our approach a scalable plug-and-play solution that seamlessly integrates with existing models while preserving their original capabilities."
}
Markdown (Informal)
[VideoPASTA: 7K Preference Pairs That Matter for Video-LLM Alignment](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1647/) (Kulkarni & Fazli, EMNLP 2025)
ACL