@inproceedings{yoon-etal-2025-raccoon,
title = "{RACC}oo{N}: Versatile Instructional Video Editing with Auto-Generated Narratives",
author = "Yoon, Jaehong and
Yu, Shoubin and
Bansal, Mohit",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1420/",
pages = "27960--27996",
ISBN = "979-8-89176-332-6",
abstract = "Recent video generative models primarily rely on detailed, labor-intensive text prompts for tasks, like inpainting or style editing, limiting adaptability for personal/raw videos. This paper proposes RACCooN, a versatile and user-friendly video-to-paragraph-to-video editing method, supporting diverse video editing capabilities, such as removal, addition, and modification, through a unified pipeline. RACCooN consists of two principal stages: Video-to-Paragraph (V2P), which automatically generates structured video descriptions capturing both scene context and object details, and Paragraph-to-Video (P2V), where users (optionally) refine these descriptions to guide a video diffusion model for flexible content modifications, including removing, changing subjects, and/or adding new objects. Key contributions of RACCooN include: (1) A multi-granular spatiotemporal pooling strategy for structured video understanding, capturing both broad context and fine-grained details of major objects to enable precise text-based video editing without the need for complex human annotations. (2) A video generative model fine-tuned on our curated video-paragraph-mask dataset, enhances the editing and inpainting quality. (3) The capability to seamlessly generate new objects in videos by forecasting their movements through automatically generated mask planning. In the end, users can easily edit complex videos with RACCooN{'}s automatic explanations and guidance. We demonstrate its versatile capabilities in video-to-paragraph generation (up to 9.4{\%}p absolute improvement in human evaluations) and video content editing (relative to 49.7{\%} lower FVD), and can be integrated with SoTA video generation models for further enhancement."
}