@inproceedings{hubkova-etal-2020-czech,
title = "{C}zech Historical Named Entity Corpus v 1.0",
author = "Hubkov{\'a}, Helena and
Kral, Pavel and
Pettersson, Eva",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.549",
pages = "4458--4465",
abstract = "As the number of digitized archival documents increases very rapidly, named entity recognition (NER) in historical documents has become very important for information extraction and data mining. For this task an annotated corpus is needed, which has up to now been missing for Czech. In this paper we present a new annotated data collection for historical NER, composed of Czech historical newspapers. This corpus is freely available for research purposes. For this corpus, we have defined relevant domain-specific named entity types and created an annotation manual for corpus labelling. We further conducted some experiments on this corpus using recurrent neural networks. We experimented with randomly initialized embeddings and static and dynamic fastText word embeddings. We achieved 0.73 F1 score with a bidirectional LSTM model using static fastText embeddings.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hubkova-etal-2020-czech">
<titleInfo>
<title>Czech Historical Named Entity Corpus v 1.0</title>
</titleInfo>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Hubková</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavel</namePart>
<namePart type="family">Kral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eva</namePart>
<namePart type="family">Pettersson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>As the number of digitized archival documents increases very rapidly, named entity recognition (NER) in historical documents has become very important for information extraction and data mining. For this task an annotated corpus is needed, which has up to now been missing for Czech. In this paper we present a new annotated data collection for historical NER, composed of Czech historical newspapers. This corpus is freely available for research purposes. For this corpus, we have defined relevant domain-specific named entity types and created an annotation manual for corpus labelling. We further conducted some experiments on this corpus using recurrent neural networks. We experimented with randomly initialized embeddings and static and dynamic fastText word embeddings. We achieved 0.73 F1 score with a bidirectional LSTM model using static fastText embeddings.</abstract>
<identifier type="citekey">hubkova-etal-2020-czech</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.549</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>4458</start>
<end>4465</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Czech Historical Named Entity Corpus v 1.0
%A Hubková, Helena
%A Kral, Pavel
%A Pettersson, Eva
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F hubkova-etal-2020-czech
%X As the number of digitized archival documents increases very rapidly, named entity recognition (NER) in historical documents has become very important for information extraction and data mining. For this task an annotated corpus is needed, which has up to now been missing for Czech. In this paper we present a new annotated data collection for historical NER, composed of Czech historical newspapers. This corpus is freely available for research purposes. For this corpus, we have defined relevant domain-specific named entity types and created an annotation manual for corpus labelling. We further conducted some experiments on this corpus using recurrent neural networks. We experimented with randomly initialized embeddings and static and dynamic fastText word embeddings. We achieved 0.73 F1 score with a bidirectional LSTM model using static fastText embeddings.
%U https://aclanthology.org/2020.lrec-1.549
%P 4458-4465
Markdown (Informal)
[Czech Historical Named Entity Corpus v 1.0](https://aclanthology.org/2020.lrec-1.549) (Hubková et al., LREC 2020)
ACL
- Helena Hubková, Pavel Kral, and Eva Pettersson. 2020. Czech Historical Named Entity Corpus v 1.0. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4458–4465, Marseille, France. European Language Resources Association.