@inproceedings{alatrash-etal-2020-ccoha,
title = "{CCOHA}: Clean Corpus of Historical {A}merican {E}nglish",
author = "Alatrash, Reem and
Schlechtweg, Dominik and
Kuhn, Jonas and
Schulte im Walde, Sabine",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.859",
pages = "6958--6966",
abstract = "Modelling language change is an increasingly important area of interest within the fields of sociolinguistics and historical linguistics. In recent years, there has been a growing number of publications whose main concern is studying changes that have occurred within the past centuries. The Corpus of Historical American English (COHA) is one of the most commonly used large corpora in diachronic studies in English. This paper describes methods applied to the downloadable version of the COHA corpus in order to overcome its main limitations, such as inconsistent lemmas and malformed tokens, without compromising its qualitative and distributional properties. The resulting corpus CCOHA contains a larger number of cleaned word tokens which can offer better insights into language change and allow for a larger variety of tasks to be performed.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alatrash-etal-2020-ccoha">
<titleInfo>
<title>CCOHA: Clean Corpus of Historical American English</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reem</namePart>
<namePart type="family">Alatrash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dominik</namePart>
<namePart type="family">Schlechtweg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonas</namePart>
<namePart type="family">Kuhn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sabine</namePart>
<namePart type="family">Schulte im Walde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>Modelling language change is an increasingly important area of interest within the fields of sociolinguistics and historical linguistics. In recent years, there has been a growing number of publications whose main concern is studying changes that have occurred within the past centuries. The Corpus of Historical American English (COHA) is one of the most commonly used large corpora in diachronic studies in English. This paper describes methods applied to the downloadable version of the COHA corpus in order to overcome its main limitations, such as inconsistent lemmas and malformed tokens, without compromising its qualitative and distributional properties. The resulting corpus CCOHA contains a larger number of cleaned word tokens which can offer better insights into language change and allow for a larger variety of tasks to be performed.</abstract>
<identifier type="citekey">alatrash-etal-2020-ccoha</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.859</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>6958</start>
<end>6966</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CCOHA: Clean Corpus of Historical American English
%A Alatrash, Reem
%A Schlechtweg, Dominik
%A Kuhn, Jonas
%A Schulte im Walde, Sabine
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F alatrash-etal-2020-ccoha
%X Modelling language change is an increasingly important area of interest within the fields of sociolinguistics and historical linguistics. In recent years, there has been a growing number of publications whose main concern is studying changes that have occurred within the past centuries. The Corpus of Historical American English (COHA) is one of the most commonly used large corpora in diachronic studies in English. This paper describes methods applied to the downloadable version of the COHA corpus in order to overcome its main limitations, such as inconsistent lemmas and malformed tokens, without compromising its qualitative and distributional properties. The resulting corpus CCOHA contains a larger number of cleaned word tokens which can offer better insights into language change and allow for a larger variety of tasks to be performed.
%U https://aclanthology.org/2020.lrec-1.859
%P 6958-6966
Markdown (Informal)
[CCOHA: Clean Corpus of Historical American English](https://aclanthology.org/2020.lrec-1.859) (Alatrash et al., LREC 2020)
ACL
- Reem Alatrash, Dominik Schlechtweg, Jonas Kuhn, and Sabine Schulte im Walde. 2020. CCOHA: Clean Corpus of Historical American English. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 6958–6966, Marseille, France. European Language Resources Association.