% WhisTam system paper (team Wave2Word) from the DravidianLangTech 2026
% workshop proceedings. Case-sensitive words in the title are protected with
% whole-word braces so that sentence-casing styles keep their capitalisation.
@inproceedings{naswan-ahmad-2026-wave2word,
  title     = {{Wave2Word}@{DravidianLangTech} 2026: {WhisTam}: A unified framework for dialect based {Tamil} speech recognition and classification},
  author    = {Naswan, Ruwad and
               Ahmad, Shadab Tanjeed},
  editor    = {Chakravarthi, Bharathi Raja and
               Priyadharshini, Ruba and
               Madasamy, Anand Kumar and
               Thavareesan, Sajeetha and
               Rajiakodi, Saranya and
               Navaneethakrishnan, Subalalitha and
               Chinnappa, Dhivya and
               Palani, Balasubramanian and
               Subramanian, Malliga and
               Shanmugavadivel, Kogilavani and
               Rajalakshmi, Ratnavel},
  booktitle = {Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for {Dravidian} Languages},
  month     = jul,
  year      = {2026},
  address   = {Underline (Virtual)},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.dravidianlangtech-1.70/},
  pages     = {442--446},
  isbn      = {979-8-89176-401-9},
  abstract  = {While Automatic Speech Recognition (ASR) systems have shown impressive performance in languages having sufficient annotated speech data like English, their performance is still limited for low-resource, dialect rich languages like Tamil. Tamil poses further challenges because of its extremely high regional variation in dialects that manifest in varying vocabulary, pronunciations, and even syntactic structures. To address these challenges, we present a unified framework WhisTam based on the Whisper medium model, which performs speech transcription and dialect classification jointly within a single system. Our method is evaluated against speech samples from four regional dialects and achieves a macro F1-score of 0.53 and a Word Error Rate (WER) of 0.55 for dialect classification and transcription respectively, ranking 2nd in the dialect classification task and 3rd in the transcription task in the DravidianLangTech@ACL 2026 shared task on Dialect-based Speech Recognition and Classification in Tamil. These findings emphasize the challenges in dialectal Tamil ASR as well as the promise of multi-task learning for low-resource languages. Our implementation is publicly available at: https://github.com/rwd51/DravidianLangTech-Wave2Word.},
}
Markdown (Informal)
[Wave2Word@DravidianLangTech 2026: WhisTam: A unified framework for dialect based Tamil speech recognition and classification](https://preview.aclanthology.org/ingest-acl-workshops/2026.dravidianlangtech-1.70/) (Naswan & Ahmad, DravidianLangTech 2026)
ACL