@inproceedings{mcnamee-2016-language,
title = "Language and Dialect Discrimination Using Compression-Inspired Language Models",
author = "McNamee, Paul",
editor = {Nakov, Preslav and
Zampieri, Marcos and
Tan, Liling and
Ljube{\v{s}}i{\'c}, Nikola and
Tiedemann, J{\"o}rg and
Malmasi, Shervin},
booktitle = "Proceedings of the Third Workshop on {NLP} for Similar Languages, Varieties and Dialects ({V}ar{D}ial3)",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://preview.aclanthology.org/Ingest-2025-COMPUTEL/W16-4825/",
pages = "195--203",
abstract = "The DSL 2016 shared task continued previous evaluations from 2014 and 2015 that facilitated the study of automated language and dialect identification. This paper describes results for this year`s shared task and from several related experiments conducted at the Johns Hopkins University Human Language Technology Center of Excellence (JHU HLTCOE). Previously the HLTCOE has explored the use of compression-inspired language modeling for language and dialect identification, using news, Wikipedia, blog post, and Twitter corpora. The technique we have relied upon is based on prediction by partial matching (PPM), a state of the art text compression technique. Due to the close relationship between adaptive compression and language modeling, such compression techniques can also be applied to multi-way text classification problems, and previous studies have examined tasks such as authorship attribution, email spam detection, and topical classification. We applied our approach to the multi-class decision that considered each dialect or language as a possibility for the given shared task input line. Results for test-set A were in accord with our expectations, however results for test-sets B and C appear to be markedly worse. We had not anticipated the inclusion of multiple communications in differing languages in test-set B (social media) input lines, and had not expected the test-set C (dialectal Arabic) data to be represented phonetically instead of in native orthography."
}
Markdown (Informal)
[Language and Dialect Discrimination Using Compression-Inspired Language Models](https://preview.aclanthology.org/Ingest-2025-COMPUTEL/W16-4825/) (McNamee, VarDial 2016)
ACL