% Workshop paper (ACL Anthology ID W17-1301): word-level language
% identification in Algerian Arabic multilingual documents.
% Proper nouns in title/booktitle are brace-protected as whole words so
% sentence-casing styles keep their capitals.
@inproceedings{adouane-dobnik-2017-identification,
  title     = {Identification of Languages in {Algerian} {Arabic} Multilingual Documents},
  author    = {Adouane, Wafia and
               Dobnik, Simon},
  editor    = {Habash, Nizar and
               Diab, Mona and
               Darwish, Kareem and
               El-Hajj, Wassim and
               Al-Khalifa, Hend and
               Bouamor, Houda and
               Tomeh, Nadi and
               El-Haj, Mahmoud and
               Zaghouani, Wajdi},
  booktitle = {Proceedings of the Third {Arabic} Natural Language Processing Workshop},
  month     = apr,
  year      = {2017},
  address   = {Valencia, Spain},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W17-1301},
  doi       = {10.18653/v1/W17-1301},
  pages     = {1--8},
  abstract  = {This paper presents a language identification system designed to detect the language of each word, in its context, in a multilingual documents as generated in social media by bilingual/multilingual communities, in our case speakers of Algerian Arabic. We frame the task as a sequence tagging problem and use supervised machine learning with standard methods like HMM and Ngram classification tagging. We also experiment with a lexicon-based method. Combining all the methods in a fall-back mechanism and introducing some linguistic rules, to deal with unseen tokens and ambiguous words, gives an overall accuracy of 93.14{\%}. Finally, we introduced rules for language identification from sequences of recognised words.},
}
Markdown (Informal)
[Identification of Languages in Algerian Arabic Multilingual Documents](https://aclanthology.org/W17-1301) (Adouane & Dobnik, WANLP 2017)
ACL