% LREC 2014 conference paper presenting version 2.5 of the Szeged Corpus.
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing styles keep their capitalisation.
@inproceedings{vincze-etal-2014-szeged,
  title     = {{Szeged} Corpus 2.5: Morphological Modifications in a Manually {POS}-tagged {Hungarian} Corpus},
  author    = {Vincze, Veronika and
               Varga, Viktor and
               Simk{\'o}, Katalin Ilona and
               Zsibrita, J{\'a}nos and
               Nagy, {\'A}goston and
               Farkas, Rich{\'a}rd and
               Csirik, J{\'a}nos},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},
  month     = may,
  year      = {2014},
  address   = {Reykjavik, Iceland},
  publisher = {European Language Resources Association (ELRA)},
  url       = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/262_Paper.pdf},
  pages     = {1074--1078},
  abstract  = {The Szeged Corpus is the largest manually annotated database containing the possible morphological analyses and lemmas for each word form. In this work, we present its latest version, Szeged Corpus 2.5, in which the new harmonized morphological coding system of Hungarian has been employed and, on the other hand, the majority of misspelled words have been corrected and tagged with the proper morphological code. New morphological codes are introduced for participles, causative / modal / frequentative verbs, adverbial pronouns and punctuation marks, moreover, the distinction between common and proper nouns is eliminated. We also report some statistical data on the frequency of the new morphological codes. The new version of the corpus made it possible to train magyarlanc, a data-driven POS-tagger of Hungarian on a dataset with the new harmonized codes. According to the results, magyarlanc is able to achieve a state-of-the-art accuracy score on the 2.5 version as well.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the conference paper with citekey vincze-etal-2014-szeged.
     Dates are given as W3CDTF year-month values (YYYY-MM). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="vincze-etal-2014-szeged">
    <titleInfo>
      <title>Szeged Corpus 2.5: Morphological Modifications in a Manually POS-tagged Hungarian Corpus</title>
    </titleInfo>
    <!-- Authors, in the order listed on the paper -->
    <name type="personal">
      <namePart type="given">Veronika</namePart>
      <namePart type="family">Vincze</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Viktor</namePart>
      <namePart type="family">Varga</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Katalin</namePart>
      <namePart type="given">Ilona</namePart>
      <namePart type="family">Simkó</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">János</namePart>
      <namePart type="family">Zsibrita</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ágoston</namePart>
      <namePart type="family">Nagy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Richárd</namePart>
      <namePart type="family">Farkas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">János</namePart>
      <namePart type="family">Csirik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued encoding="w3cdtf">2014-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
      </titleInfo>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Reykjavik, Iceland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The Szeged Corpus is the largest manually annotated database containing the possible morphological analyses and lemmas for each word form. In this work, we present its latest version, Szeged Corpus 2.5, in which the new harmonized morphological coding system of Hungarian has been employed and, on the other hand, the majority of misspelled words have been corrected and tagged with the proper morphological code. New morphological codes are introduced for participles, causative / modal / frequentative verbs, adverbial pronouns and punctuation marks, moreover, the distinction between common and proper nouns is eliminated. We also report some statistical data on the frequency of the new morphological codes. The new version of the corpus made it possible to train magyarlanc, a data-driven POS-tagger of Hungarian on a dataset with the new harmonized codes. According to the results, magyarlanc is able to achieve a state-of-the-art accuracy score on the 2.5 version as well.</abstract>
    <identifier type="citekey">vincze-etal-2014-szeged</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/262_Paper.pdf</url>
    </location>
    <part>
      <date encoding="w3cdtf">2014-05</date>
      <extent unit="page">
        <start>1074</start>
        <end>1078</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Szeged Corpus 2.5: Morphological Modifications in a Manually POS-tagged Hungarian Corpus
%A Vincze, Veronika
%A Varga, Viktor
%A Simkó, Katalin Ilona
%A Zsibrita, János
%A Nagy, Ágoston
%A Farkas, Richárd
%A Csirik, János
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F vincze-etal-2014-szeged
%X The Szeged Corpus is the largest manually annotated database containing the possible morphological analyses and lemmas for each word form. In this work, we present its latest version, Szeged Corpus 2.5, in which the new harmonized morphological coding system of Hungarian has been employed and, on the other hand, the majority of misspelled words have been corrected and tagged with the proper morphological code. New morphological codes are introduced for participles, causative / modal / frequentative verbs, adverbial pronouns and punctuation marks, moreover, the distinction between common and proper nouns is eliminated. We also report some statistical data on the frequency of the new morphological codes. The new version of the corpus made it possible to train magyarlanc, a data-driven POS-tagger of Hungarian on a dataset with the new harmonized codes. According to the results, magyarlanc is able to achieve a state-of-the-art accuracy score on the 2.5 version as well.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/262_Paper.pdf
%P 1074-1078
Markdown (Informal)
[Szeged Corpus 2.5: Morphological Modifications in a Manually POS-tagged Hungarian Corpus](http://www.lrec-conf.org/proceedings/lrec2014/pdf/262_Paper.pdf) (Vincze et al., LREC 2014)
ACL