@comment{
  Blodgett, Wei and O'Connor (2017): workshop paper in the Proceedings of the
  3rd Workshop on Noisy User-generated Text; ACL Anthology ID W17-4408.
  The url field uses the canonical Anthology landing page rather than a
  temporary preview-branch deployment. The address field is kept exactly as
  exported by the Anthology.
}
@inproceedings{blodgett-etal-2017-dataset,
  author    = {Blodgett, Su Lin and
               Wei, Johnny and
               O{'}Connor, Brendan},
  title     = {A Dataset and Classifier for Recognizing Social Media {English}},
  editor    = {Derczynski, Leon and
               Xu, Wei and
               Ritter, Alan and
               Baldwin, Tim},
  booktitle = {Proceedings of the 3rd Workshop on Noisy User-generated Text},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  pages     = {56--61},
  doi       = {10.18653/v1/W17-4408},
  url       = {https://aclanthology.org/W17-4408/},
  abstract  = {While language identification works well on standard texts, it performs much worse on social media language, in particular dialectal language{---}even for English. First, to support work on English language identification, we contribute a new dataset of tweets annotated for English versus non-English, with attention to ambiguity, code-switching, and automatic generation issues. It is randomly sampled from all public messages, avoiding biases towards pre-existing language classifiers. Second, we find that a demographic language model{---}which identifies messages with language similar to that used by several U.S. ethnic populations on Twitter{---}can be used to improve English language identification performance when combined with a traditional supervised language identifier. It increases recall with almost no loss of precision, including, surprisingly, for English messages written by non-U.S. authors. Our dataset and identifier ensemble are available online.},
}
Markdown (Informal)
[A Dataset and Classifier for Recognizing Social Media English](https://aclanthology.org/W17-4408/) (Blodgett et al., WNUT 2017)
ACL