% Case-sensitive words in the title are braced as whole words so that
% sentence-casing styles keep their capitals without splitting the word.
@inproceedings{sterner-teufel-2023-tongueswitcher,
  title     = {{TongueSwitcher}: Fine-Grained Identification of {German}-{English} Code-Switching},
  author    = {Sterner, Igor and
               Teufel, Simone},
  editor    = {Winata, Genta and
               Kar, Sudipta and
               Zhukova, Marina and
               Solorio, Thamar and
               Diab, Mona and
               Sitaram, Sunayana and
               Choudhury, Monojit and
               Bali, Kalika},
  booktitle = {Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.calcs-1.1},
  doi       = {10.18653/v1/2023.calcs-1.1},
  pages     = {1--13},
  abstract  = {This paper contributes to German-English code-switching research. We provide the largest corpus of naturally occurring German-English code-switching, where English is included in German text, and two methods for code-switching identification. The first method is rule-based, using wordlists and morphological processing. We use this method to compile a corpus of 25.6M tweets employing German-English code-switching. In our second method, we continue pretraining of a neural language model on this corpus and classify tokens based on embeddings from this language model. Our systems establish SoTA on our new corpus and an existing German-English code-switching benchmark. In particular, we systematically study code-switching for language-ambiguous words which can only be resolved in context, and morphologically mixed words consisting of both English and German morphemes. We distribute both corpora and systems to the research community.},
}
Markdown (Informal)
[TongueSwitcher: Fine-Grained Identification of German-English Code-Switching](https://aclanthology.org/2023.calcs-1.1) (Sterner & Teufel, CALCS-WS 2023)
ACL