% Workshop paper from the Tenth Workshop on Noisy and User-generated Text (2025).
% Names are stored in "von Last, First" form so that particles such as "van der"
% are parsed as part of the surname rather than as given names.
% The url field holds the canonical ACL Anthology address, not a preview-branch link.
@inproceedings{znotins-gruzitis-2025-conversational,
  author    = {Znotins, Arturs and
               Gruzitis, Normunds and
               Dargis, Roberts},
  title     = {From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting},
  editor    = {Bak, JinYeong and
               van der Goot, Rob and
               Jang, Hyeju and
               Buaphet, Weerayut and
               Ramponi, Alan and
               Xu, Wei and
               Ritter, Alan},
  booktitle = {Proceedings of the Tenth Workshop on Noisy and User-generated Text},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, New Mexico, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {143--148},
  isbn      = {979-8-89176-232-9},
  url       = {https://aclanthology.org/2025.wnut-1.15/},
  abstract  = {We present ongoing research on automatic post-processing approaches to enhance the readability of noisy speech transcripts in low-resource languages, with a focus on conversational speech in Latvian. We compare transformer-based sequence-labeling models and large language models (LLMs) for the standard punctuation and capitalization restoration task, while also considering automatic correction of mispronounced words and disfluency, and partial inverse text normalization. Our results show that very small LLMs (approx. 2B parameters), fine-tuned on a modest text corpus, can achieve near state-of-the-art performance, rivaling orders of magnitude larger LLMs. Additionally, we demonstrate that a fine-tuned Whisper model, leveraging acoustic cues, outperforms text-only systems on challenging conversational data, even for a low-resource language. Error analysis reveals recurring pitfalls in sentence boundary determination and disfluency handling, emphasizing the importance of consistent annotation and domain adaptation for robust post-processing. Our findings highlight the feasibility of developing efficient post-processing solutions that significantly refine ASR output in low-resource settings, while opening new possibilities for editing and formatting speech transcripts beyond mere restoration of punctuation and capitalization.},
}
Markdown (Informal)
[From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting](https://aclanthology.org/2025.wnut-1.15/) (Znotins et al., WNUT 2025)
ACL