% Rizal and Stymne (2020): workshop paper evaluating word embeddings for
% Indonesian-English code-mixed text, published in the CALCS 2020 proceedings.
% Maintenance notes:
%   - Proper nouns in the title are brace-protected as whole words so that
%     sentence-casing styles keep their capitals.
%   - The url field holds the permanent Anthology address for paper ID
%     2020.calcs-1.4 rather than a preview-branch mirror.
%   - The abstract is reproduced verbatim from the publication.
@inproceedings{rizal-stymne-2020-evaluating,
  title     = {Evaluating Word Embeddings for {Indonesian}--{English} Code-Mixed Text Based on Synthetic Data},
  author    = {Rizal, Arra{'}Di Nur and
               Stymne, Sara},
  editor    = {Solorio, Thamar and
               Choudhury, Monojit and
               Bali, Kalika and
               Sitaram, Sunayana and
               Das, Amitava and
               Diab, Mona},
  booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Code Switching},
  month     = may,
  year      = 2020,
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.calcs-1.4/},
  pages     = {26--35},
  language  = {eng},
  isbn      = {979-10-95546-66-5},
  abstract  = {Code-mixed texts are abundant, especially in social media, and poses a problem for NLP tools, which are typically trained on monolingual corpora. In this paper, we explore and evaluate different types of word embeddings for Indonesian{--}English code-mixed text. We propose the use of code-mixed embeddings, i.e. embeddings trained on code-mixed text. Because large corpora of code-mixed text are required to train embeddings, we describe a method for synthesizing a code-mixed corpus, grounded in literature and a survey. Using sentiment analysis as a case study, we show that code-mixed embeddings trained on synthesized data are at least as good as cross-lingual embeddings and better than monolingual embeddings.},
}
Markdown (Informal)
[Evaluating Word Embeddings for Indonesian–English Code-Mixed Text Based on Synthetic Data](https://aclanthology.org/2020.calcs-1.4/) (Rizal & Stymne, CALCS 2020)
ACL