@inproceedings{read-etal-2012-wesearch,
title = "The {W}e{S}earch Corpus, Treebank, and Treecache {--} A Comprehensive Sample of User-Generated Content",
author = "Read, Jonathon and
Flickinger, Dan and
Dridan, Rebecca and
Oepen, Stephan and
{\O}vrelid, Lilja",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/774_Paper.pdf",
pages = "1829--1835",
abstract = "We present the WeSearch Data Collection (WDC){---}a freely redistributable, partly annotated, comprehensive sample of User-Generated Content. The WDC contains data extracted from a range of genres of varying formality (user forums, product review sites, blogs and Wikipedia) and covers two different domains (NLP and Linux). In this article, we describe the data selection and extraction process, with a focus on the extraction of linguistic content from different sources. We present the format of syntacto-semantic annotations found in this resource and present initial parsing results for these data, as well as some reflections following a first round of treebanking.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 bibliographic record for a single conference paper. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="read-etal-2012-wesearch">
    <titleInfo>
      <title>The WeSearch Corpus, Treebank, and Treecache – A Comprehensive Sample of User-Generated Content</title>
    </titleInfo>

    <!-- Authors, one <name> per person, in the order they are listed. -->
    <name type="personal">
      <namePart type="given">Jonathon</namePart>
      <namePart type="family">Read</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Flickinger</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rebecca</namePart>
      <namePart type="family">Dridan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Stephan</namePart>
      <namePart type="family">Oepen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lilja</namePart>
      <namePart type="family">Øvrelid</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date as a W3CDTF year-month (May 2012) so tools can parse it. -->
    <originInfo>
      <dateIssued encoding="w3cdtf">2012-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host publication: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
      </titleInfo>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Istanbul, Turkey</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <!-- Abstract is kept on one line: whitespace inside text content is data. -->
    <abstract>We present the WeSearch Data Collection (WDC)—a freely redistributable, partly annotated, comprehensive sample of User-Generated Content. The WDC contains data extracted from a range of genres of varying formality (user forums, product review sites, blogs and Wikipedia) and covers two different domains (NLP and Linux). In this article, we describe the data selection and extraction process, with a focus on the extraction of linguistic content from different sources. We present the format of syntacto-semantic annotations found in this resource and present initial parsing results for these data, as well as some reflections following a first round of treebanking.</abstract>
    <identifier type="citekey">read-etal-2012-wesearch</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/774_Paper.pdf</url>
    </location>

    <!-- Position within the host volume: same issue date, pages 1829 to 1835. -->
    <part>
      <date encoding="w3cdtf">2012-05</date>
      <extent unit="page">
        <start>1829</start>
        <end>1835</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The WeSearch Corpus, Treebank, and Treecache – A Comprehensive Sample of User-Generated Content
%A Read, Jonathon
%A Flickinger, Dan
%A Dridan, Rebecca
%A Oepen, Stephan
%A Øvrelid, Lilja
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 may
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F read-etal-2012-wesearch
%X We present the WeSearch Data Collection (WDC)—a freely redistributable, partly annotated, comprehensive sample of User-Generated Content. The WDC contains data extracted from a range of genres of varying formality (user forums, product review sites, blogs and Wikipedia) and covers two different domains (NLP and Linux). In this article, we describe the data selection and extraction process, with a focus on the extraction of linguistic content from different sources. We present the format of syntacto-semantic annotations found in this resource and present initial parsing results for these data, as well as some reflections following a first round of treebanking.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/774_Paper.pdf
%P 1829-1835
Markdown (Informal)
[The WeSearch Corpus, Treebank, and Treecache – A Comprehensive Sample of User-Generated Content](http://www.lrec-conf.org/proceedings/lrec2012/pdf/774_Paper.pdf) (Read et al., LREC 2012)
ACL