@comment{
  Workshop paper entry exported from the ACL Anthology.
  NOTE(review): the url field points at a preview branch host
  (preview.aclanthology.org/manual-author-scripts). The permanent address is
  presumably https://aclanthology.org/2026.silkroadnlp-1.13/ -- confirm that
  page exists before switching, since preview links may not be stable.
  Proper nouns and acronyms in title/booktitle are brace-protected as whole
  words so that sentence-casing styles keep their capitalisation.
}
@inproceedings{iranmanesh-etal-2026-segmentation,
  title     = {Segmentation Strategy Matters: Benchmarking {Whisper} on {Persian} {YouTube} Content},
  author    = {Iranmanesh, Reihaneh and
               Ziaei, Rojin and
               Garman, Joe},
  editor    = {Merchant, Rayyan and
               Megerdoomian, Karine},
  booktitle = {The Proceedings of the First Workshop on {NLP} and {LLMs} for the {Iranian} Language Family},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.silkroadnlp-1.13/},
  pages     = {121--130},
  isbn      = {979-8-89176-371-5},
  abstract  = {Automatic Speech Recognition (ASR) transcription accuracy remains highly sensitive to audio segmentation strategies, yet most benchmarks assume oracle timestamps unavailable in deployment. We systematically evaluate how audio segmentation affects Whisper{'}s performance on 10 hours of Persian YouTube content, comparing transcript-aligned (oracle) versus silence-based (realistic) approaches across contrasting acoustic conditions. Results reveal striking content-type dependency: podcast content benefits from timestamp segmentation (33{\%} lower mean WER), while entertainment content favors silence-based segmentation (8{\%} lower mean WER). This finding demonstrates that optimal segmentation must be content-aware, with silence detection better capturing natural boundaries in acoustically heterogeneous media while avoiding mid-utterance splits. We publicly release our evaluation framework, 10 hours of audio with gold transcripts, and segmentation results here: https://github.com/ri164-bolleit/persian-youtube-whisper-benchmark},
}
Markdown (Informal)
[Segmentation Strategy Matters: Benchmarking Whisper on Persian YouTube Content](https://preview.aclanthology.org/manual-author-scripts/2026.silkroadnlp-1.13/) (Iranmanesh et al., SilkRoadNLP 2026)
ACL