@inproceedings{asha-hosahalli-lakshmaiah-2023-kt2,
title = "{KT}2: {K}annada-{T}ulu Parallel Corpus Construction for Neural Machine Translation",
author = "Hegde, Asha and
Shashirekha, Hosahalli Lakshmaiah",
editor = "D. Pawar, Jyoti and
Lalitha Devi, Sobha",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2023.icon-1.75/",
pages = "743--753",
abstract = "In the last decade, Neural Machine Translation (NMT) has experienced substantial advances. However, its widespread success has revealed a limitation in terms of reduced proficiency when dealing with under-resourced language pairs, mainly due to the lack of parallel corpora in comparison to high-resourced language pairs like English-German, EnglishSpanish, and English-French. As a result, researchers have increasingly focused on implementing NMT techniques tailored to underresourced language pairs and thereby, the construction/collection of parallel corpora. In view of the scarcity of parallel corpus for underresourced languages, the strategies for building a Kannada-Tulu parallel corpus and baseline models for Machine Translation (MT) of Kannada-Tulu are described in this paper. Both Kannada and Tulu languages are under-resourced due to lack of processing tools and digital resources, especially parallel corpora, which are critical for MT development. Kannada-Tulu parallel corpus is constructed in two ways: i) Manual Translation and ii) Automatic Text Generation (ATG). Various encoderdecoder based NMT approaches, including Recurrent Neural Network (RNN), Bidirectional RNN (BiRNN), and transformer-based architectures, trained with Gated Recurrent Units (GRU) and Long Short Term Memory (LSTM) units, are explored as baseline models for Kannada to Tulu (Kan-Tul) and Tulu to Kannada (Kan-Tul) sentence-level translations. Additionally, the study explores sub-word tokenization techniques for Kannada-Tulu language pairs, and the performances of these NMT models are evaluated using Character n-gram Fscore (CHRF) and Bilingual Evaluation Understudy (BLEU) scores. Among the baselines, the transformer-based models outperformed other models with BLEU scores of 0.241 and 0.341 and CHRF scores of 0.502 and 0.598 for KanTul and Kan-Tul sentence-level translations, respectively."
}
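
The abstract reports corpus-level CHRF and BLEU scores for the baseline models. As a quick illustration of how such scores are typically computed, here is a minimal sketch using the sacrebleu library; this is not the authors' own evaluation code, the sentence pairs are hypothetical placeholders, and sacrebleu reports BLEU on a 0-100 scale whereas the abstract's scores appear to use a 0-1 scale.

```python
# Minimal sketch: corpus-level BLEU and CHRF scoring with sacrebleu.
# Illustrative only -- the hypothesis/reference pairs below are
# hypothetical placeholders, not data from the KT2 Kannada-Tulu corpus.
from sacrebleu.metrics import BLEU, CHRF

hypotheses = [
    "the model translates this sentence",
    "another translated sentence",
]
# One reference stream, parallel to the hypotheses; corpus_score takes a
# list of such streams to support multiple references per hypothesis.
references = [
    [
        "the model translated this sentence",
        "another translated sentence",
    ]
]

print(BLEU().corpus_score(hypotheses, references))  # prints e.g. "BLEU = ..."
print(CHRF().corpus_score(hypotheses, references))  # prints e.g. "chrF2 = ..."
```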