@inproceedings{gautam-etal-2021-translate,
title = "Translate and Classify: Improving Sequence Level Classification for {E}nglish-{H}indi Code-Mixed Data",
author = "Gautam, Devansh and
Gupta, Kshitij and
Shrivastava, Manish",
editor = "Solorio, Thamar and
Chen, Shuguang and
Black, Alan W. and
Diab, Mona and
Sitaram, Sunayana and
Soto, Victor and
Yilmaz, Emre and
Srinivasan, Anirudh",
booktitle = "Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.calcs-1.3/",
doi = "10.18653/v1/2021.calcs-1.3",
pages = "15--25",
    abstract = "Code-mixing is a common phenomenon in multilingual societies around the world and is especially common in social media texts. Traditional NLP systems, usually trained on monolingual corpora, do not perform well on code-mixed texts. Training specialized models for code-switched texts is difficult due to the lack of large-scale datasets. Translating code-mixed data into standard languages like English could improve performance on various code-mixed tasks since we can use transfer learning from state-of-the-art English models for processing the translated data. This paper focuses on two sequence-level classification tasks for English-Hindi code-mixed texts, which are part of the GLUECoS benchmark: Natural Language Inference and Sentiment Analysis. We propose using various pre-trained models that have been fine-tuned for similar English-only tasks and have shown state-of-the-art performance. We further fine-tune these models on the translated code-mixed datasets and achieve state-of-the-art performance in both tasks. To translate English-Hindi code-mixed data to English, we use mBART, a pre-trained multilingual sequence-to-sequence model that has shown competitive performance on various low-resource machine translation pairs and has also shown performance gains in languages that were not in its pre-training corpus."
}