@inproceedings{rapp-2014-using-word,
title = "Using Word Familiarities and Word Associations to Measure Corpus Representativeness",
author = "Rapp, Reinhard",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/492_Paper.pdf",
pages = "2029--2036",
abstract = "The definition of corpus representativeness used here assumes that a representative corpus should reflect as well as possible the average language use a native speaker encounters in everyday life over a longer period of time. As it is not practical to observe people{'}s language input over years, we suggest to utilize two types of experimental data capturing two forms of human intuitions: Word familiarity norms and word association norms. If it is true that human language acquisition is corpus-based, such data should reflect people{'}s perceived language input. Assuming so, we compute a representativeness score for a corpus by extracting word frequency and word association statistics from it and by comparing these statistics to the human data. The higher the similarity, the more representative the corpus should be for the language environments of the test persons. We present results for five different corpora and for truncated versions thereof. The results confirm the expectation that corpus size and corpus balance are crucial aspects for corpus representativeness.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rapp-2014-using-word">
<titleInfo>
<title>Using Word Familiarities and Word Associations to Measure Corpus Representativeness</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The definition of corpus representativeness used here assumes that a representative corpus should reflect as well as possible the average language use a native speaker encounters in everyday life over a longer period of time. As it is not practical to observe people’s language input over years, we suggest to utilize two types of experimental data capturing two forms of human intuitions: Word familiarity norms and word association norms. If it is true that human language acquisition is corpus-based, such data should reflect people’s perceived language input. Assuming so, we compute a representativeness score for a corpus by extracting word frequency and word association statistics from it and by comparing these statistics to the human data. The higher the similarity, the more representative the corpus should be for the language environments of the test persons. We present results for five different corpora and for truncated versions thereof. The results confirm the expectation that corpus size and corpus balance are crucial aspects for corpus representativeness.</abstract>
<identifier type="citekey">rapp-2014-using-word</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/492_Paper.pdf</url>
</location>
<part>
<date>2014-may</date>
<extent unit="page">
<start>2029</start>
<end>2036</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using Word Familiarities and Word Associations to Measure Corpus Representativeness
%A Rapp, Reinhard
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 may
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F rapp-2014-using-word
%X The definition of corpus representativeness used here assumes that a representative corpus should reflect as well as possible the average language use a native speaker encounters in everyday life over a longer period of time. As it is not practical to observe people’s language input over years, we suggest to utilize two types of experimental data capturing two forms of human intuitions: Word familiarity norms and word association norms. If it is true that human language acquisition is corpus-based, such data should reflect people’s perceived language input. Assuming so, we compute a representativeness score for a corpus by extracting word frequency and word association statistics from it and by comparing these statistics to the human data. The higher the similarity, the more representative the corpus should be for the language environments of the test persons. We present results for five different corpora and for truncated versions thereof. The results confirm the expectation that corpus size and corpus balance are crucial aspects for corpus representativeness.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/492_Paper.pdf
%P 2029-2036
Markdown (Informal)
[Using Word Familiarities and Word Associations to Measure Corpus Representativeness](http://www.lrec-conf.org/proceedings/lrec2014/pdf/492_Paper.pdf) (Rapp, LREC 2014)
ACL