% CSCS corpus paper (LREC 2020); ACL Anthology ID 2020.lrec-1.489.
@inproceedings{balabel-etal-2020-cairo,
  author    = {Balabel, Mohamed and
               Hamed, Injy and
               Abdennadher, Slim and
               Vu, Ngoc Thang and
               {\c{C}}etino{\u{g}}lu, {\"O}zlem},
  title     = {Cairo Student Code-Switch ({CSCS}) Corpus: An Annotated {Egyptian} {Arabic}-{English} Corpus},
  booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  pages     = {3973--3977},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.489},
  language  = {English},
  abstract  = {Code-switching has become a prevalent phenomenon across many communities. It poses a challenge to NLP researchers, mainly due to the lack of available data needed for training and testing applications. In this paper, we introduce a new resource: a corpus of Egyptian-Arabic code-switch speech data that is fully tokenized, lemmatized and annotated for part-of-speech tags. Beside the corpus itself, we provide annotation guidelines to address the unique challenges of annotating code-switch data. Another challenge that we address is the fact that Egyptian Arabic orthography and grammar are not standardized.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the CSCS corpus paper (citekey: balabel-etal-2020-cairo). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="balabel-etal-2020-cairo">
    <titleInfo>
      <title>Cairo Student Code-Switch (CSCS) Corpus: An Annotated Egyptian Arabic-English Corpus</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Mohamed</namePart>
      <namePart type="family">Balabel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Injy</namePart>
      <namePart type="family">Hamed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Slim</namePart>
      <namePart type="family">Abdennadher</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ngoc</namePart>
      <namePart type="given">Thang</namePart>
      <namePart type="family">Vu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Özlem</namePart>
      <namePart type="family">Çetinoğlu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <!-- Machine-readable year-month (W3CDTF) instead of the free-text "2020-may". -->
      <dateIssued encoding="w3cdtf">2020-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <language>
      <languageTerm type="text">English</languageTerm>
      <languageTerm type="code" authority="iso639-2b">eng</languageTerm>
    </language>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
      </titleInfo>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-10-95546-34-4</identifier>
    </relatedItem>
    <abstract>Code-switching has become a prevalent phenomenon across many communities. It poses a challenge to NLP researchers, mainly due to the lack of available data needed for training and testing applications. In this paper, we introduce a new resource: a corpus of Egyptian-Arabic code-switch speech data that is fully tokenized, lemmatized and annotated for part-of-speech tags. Beside the corpus itself, we provide annotation guidelines to address the unique challenges of annotating code-switch data. Another challenge that we address is the fact that Egyptian Arabic orthography and grammar are not standardized.</abstract>
    <identifier type="citekey">balabel-etal-2020-cairo</identifier>
    <location>
      <url>https://aclanthology.org/2020.lrec-1.489</url>
    </location>
    <part>
      <date encoding="w3cdtf">2020-05</date>
      <extent unit="page">
        <start>3973</start>
        <end>3977</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Cairo Student Code-Switch (CSCS) Corpus: An Annotated Egyptian Arabic-English Corpus
%A Balabel, Mohamed
%A Hamed, Injy
%A Abdennadher, Slim
%A Vu, Ngoc Thang
%A Çetinoğlu, Özlem
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F balabel-etal-2020-cairo
%X Code-switching has become a prevalent phenomenon across many communities. It poses a challenge to NLP researchers, mainly due to the lack of available data needed for training and testing applications. In this paper, we introduce a new resource: a corpus of Egyptian-Arabic code-switch speech data that is fully tokenized, lemmatized and annotated for part-of-speech tags. Beside the corpus itself, we provide annotation guidelines to address the unique challenges of annotating code-switch data. Another challenge that we address is the fact that Egyptian Arabic orthography and grammar are not standardized.
%U https://aclanthology.org/2020.lrec-1.489
%P 3973-3977
Markdown (Informal)
[Cairo Student Code-Switch (CSCS) Corpus: An Annotated Egyptian Arabic-English Corpus](https://aclanthology.org/2020.lrec-1.489) (Balabel et al., LREC 2020)
ACL