% CoPara corpus paper, DravidianLangTech 2023 workshop (ACL Anthology ID 2023.dravidianlangtech-1.12).
% Case-sensitive words in the title are protected with whole-word braces so that
% sentence-casing styles keep "CoPara" and "Dravidian" intact.
@inproceedings{e-etal-2023-copara,
    title     = {{CoPara}: The First {Dravidian} Paragraph-level n-way Aligned Corpus},
    author    = {E, Nikhil and
                 Choudhary, Mukund and
                 Mamidi, Radhika},
    editor    = {Chakravarthi, Bharathi R. and
                 Priyadharshini, Ruba and
                 M, Anand Kumar and
                 Thavareesan, Sajeetha and
                 Sherly, Elizabeth},
    booktitle = {Proceedings of the Third Workshop on Speech and Language Technologies for Dravidian Languages},
    month     = sep,
    year      = {2023},
    address   = {Varna, Bulgaria},
    publisher = {INCOMA Ltd., Shoumen, Bulgaria},
    url       = {https://aclanthology.org/2023.dravidianlangtech-1.12/},
    pages     = {88--96},
    abstract  = {We present CoPara, the first publicly available paragraph-level (n-way aligned) multilingual parallel corpora for Dravidian languages. The collection contains 2856 paragraph/passage pairs between English and four Dravidian languages. We source the parallel paragraphs from the New India Samachar magazine and align them with English as a pivot language. We do human and artificial evaluations to validate the high-quality alignment and richness of the parallel paragraphs of a range of lengths. To show one of the many ways this dataset can be wielded, we finetuned IndicBART, a seq2seq NMT model on all XX-En pairs of languages in CoPara which perform better than existing sentence-level models on standard benchmarks (like BLEU) on sentence level translations and longer text too. We show how this dataset can enrich a model trained for a task like this, with more contextual cues and beyond sentence understanding even in low-resource settings like that of Dravidian languages. Finally, the dataset and models are made available publicly at CoPara to help advance research in Dravidian NLP, parallel multilingual, and beyond sentence-level tasks like NMT, etc.},
}
Markdown (Informal)
[CoPara: The First Dravidian Paragraph-level n-way Aligned Corpus](https://aclanthology.org/2023.dravidianlangtech-1.12/) (E et al., DravidianLangTech 2023)
ACL