@comment{
  Workshop paper record for Nali and Jana (DravidianLangTech 2026).
  "Dravidian" is brace-protected as a whole word so that sentence-casing
  bibliography styles keep its capital letter.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  host, which looks like a staging link; confirm the canonical Anthology URL
  (and DOI, if one is assigned) before citing from this record.
}
@inproceedings{nali-jana-2026-cascaded,
    title     = {Cascaded Modular or End-to-End? : An Investigation on Speech-to-Speech Translation Task for {Dravidian} Languages},
    author    = {Nali, Bhavana and
                 Jana, Abhik},
    editor    = {Chakravarthi, Bharathi Raja and
                 Priyadharshini, Ruba and
                 Madasamy, Anand Kumar and
                 Thavareesan, Sajeetha and
                 Rajiakodi, Saranya and
                 Navaneethakrishnan, Subalalitha and
                 Chinnappa, Dhivya and
                 Palani, Balasubramanian and
                 Subramanian, Malliga and
                 Shanmugavadivel, Kogilavani and
                 Rajalakshmi, Ratnavel},
    booktitle = {Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for {Dravidian} Languages},
    month     = jul,
    year      = {2026},
    address   = {Underline (Virtual)},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.dravidianlangtech-1.3/},
    pages     = {14--20},
    isbn      = {979-8-89176-401-9},
    abstract  = {This paper presents a study of speech-to-speech translation for low-resource Dravidian languages, focusing on Tamil, Telugu, and Kannada. We investigate the efficacy of the Cascaded Modular system with the End-to-end system in both zero-shot and fine-tuned settings. The Cascaded Modular approach combines an ASR Module (Whisper-based ASR for English speech; IndicConformer for Dravidian speech), a Text-to-Text translation module (IndicTrans2), and a Speech synthesis module (Indic Parler-TTS), whereas SeamlessM4T is used as the End-to-end system. For parameter-efficient Low-Rank Adaptation (LoRA) fine-tuning to adapt the translation component to a domain-specific dataset, we use FLEURS and Mann-ki-Baat (a subset of BhasaAnuvaad dataset). Cascaded Modular systems achieve BLEU scores ranging from 3.17 to 19.18 in the zero-shot setting and 5.08 to 19.18 after fine-tuning, whereas the End-to-end model ranges from 3.02 to 15.72 in zero-shot settings across languages and 4.11 to 16.84 after fine-tuning. The results show that Cascaded Modular systems consistently outperform the End-to-end model across both setups. Note that parameter-efficient fine-tuning yields significant improvements in translation quality and speech generation performance for low-resource Dravidian speech translation.},
}
Markdown (Informal)
[Cascaded Modular or End-to-End? : An Investigation on Speech-to-Speech Translation Task for Dravidian Languages](https://preview.aclanthology.org/ingest-acl-workshops/2026.dravidianlangtech-1.3/) (Nali & Jana, DravidianLangTech 2026)
ACL