@inproceedings{ditchfield-ogle-mitkov-2025-comparative,
    title = "A Comparative Study of Vision Transformers and Multimodal Language Models for Violence Detection in Videos",
    author = "Ditchfield-Ogle, Tomas and
      Mitkov, Ruslan",
    editor = "Picazo-Izquierdo, Alicia and
      Estevanell-Valladares, Ernesto Luis and
      Mitkov, Ruslan and
      Guillena, Rafael Mu{\~n}oz and
      Cerd{\'a}, Ra{\'u}l Garc{\'i}a",
    booktitle = "Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models",
    month = sep,
    year = "2025",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://preview.aclanthology.org/corrections-2026-01/2025.r2lm-1.2/",
    pages = "10--20",
    abstract = "This project compares methods for detecting violent videos, which are crucial for ensuring real-time safety in surveillance and digital moderation. It evaluates four approaches: a random forest classifier, a transformer model, and two multimodal vision-language models. The process involves preprocessing datasets, training models, and assessing accuracy, interpretability, scalability, and real-time suitability. Results show that traditional methods are simple but less effective. The transformer model achieved high accuracy, and the multimodal models offered high violence recall with descriptive justifications. The study highlights trade-offs and provides practical insights for the deployment of automated violence detection."
}
Markdown (Informal)
[A Comparative Study of Vision Transformers and Multimodal Language Models for Violence Detection in Videos](https://preview.aclanthology.org/corrections-2026-01/2025.r2lm-1.2/) (Ditchfield-Ogle & Mitkov, R2LM 2025)
ACL