% Shared-task system paper from the BSNLP 2021 workshop (ACL Anthology ID 2021.bsnlp-1.9).
% Braced words in `title` keep their capitalisation under sentence-casing styles.
@inproceedings{prelevikj-zitnik-2021-multilingual,
  title     = {Multilingual Named Entity Recognition and Matching Using {BERT} and {Dedupe} for {Slavic} Languages},
  author    = {Prelevikj, Marko and
               Zitnik, Slavko},
  booktitle = {Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing},
  month     = apr,
  year      = {2021},
  address   = {Kyiv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.bsnlp-1.9},
  pages     = {80--85},
  abstract  = {This paper describes the University of Ljubljana (UL FRI) Group{'}s submissions to the shared task at the Balto-Slavic Natural Language Processing (BSNLP) 2021 Workshop. We experiment with multiple BERT-based models, pre-trained in multi-lingual, Croatian-Slovene-English and Slovene-only data. We perform training iteratively and on the concatenated data of previously available NER datasets. For the normalization task we use Stanza lemmatizer, while for entity matching we implemented a baseline using the Dedupe library. The performance of evaluations suggests that multi-source settings outperform less-resourced approaches. The best NER models achieve 0.91 F-score on Slovene training data splits while the best official submission achieved F-scores of 0.84 and 0.78 for relaxed partial matching and strict settings, respectively. In multi-lingual NER setting we achieve F-scores of 0.82 and 0.74.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey prelevikj-zitnik-2021-multilingual: the paper itself, plus its host proceedings as a relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="prelevikj-zitnik-2021-multilingual">
    <titleInfo>
      <title>Multilingual Named Entity Recognition and Matching Using BERT and Dedupe for Slavic Languages</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Marko</namePart>
      <namePart type="family">Prelevikj</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Slavko</namePart>
      <namePart type="family">Zitnik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-apr</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume in which the paper appears. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Kyiv, Ukraine</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper describes the University of Ljubljana (UL FRI) Group’s submissions to the shared task at the Balto-Slavic Natural Language Processing (BSNLP) 2021 Workshop. We experiment with multiple BERT-based models, pre-trained in multi-lingual, Croatian-Slovene-English and Slovene-only data. We perform training iteratively and on the concatenated data of previously available NER datasets. For the normalization task we use Stanza lemmatizer, while for entity matching we implemented a baseline using the Dedupe library. The performance of evaluations suggests that multi-source settings outperform less-resourced approaches. The best NER models achieve 0.91 F-score on Slovene training data splits while the best official submission achieved F-scores of 0.84 and 0.78 for relaxed partial matching and strict settings, respectively. In multi-lingual NER setting we achieve F-scores of 0.82 and 0.74.</abstract>
    <identifier type="citekey">prelevikj-zitnik-2021-multilingual</identifier>
    <location>
      <url>https://aclanthology.org/2021.bsnlp-1.9</url>
    </location>
    <part>
      <date>2021-apr</date>
      <extent unit="page">
        <start>80</start>
        <end>85</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Multilingual Named Entity Recognition and Matching Using BERT and Dedupe for Slavic Languages
%A Prelevikj, Marko
%A Zitnik, Slavko
%S Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing
%D 2021
%8 apr
%I Association for Computational Linguistics
%C Kyiv, Ukraine
%F prelevikj-zitnik-2021-multilingual
%X This paper describes the University of Ljubljana (UL FRI) Group’s submissions to the shared task at the Balto-Slavic Natural Language Processing (BSNLP) 2021 Workshop. We experiment with multiple BERT-based models, pre-trained in multi-lingual, Croatian-Slovene-English and Slovene-only data. We perform training iteratively and on the concatenated data of previously available NER datasets. For the normalization task we use Stanza lemmatizer, while for entity matching we implemented a baseline using the Dedupe library. The performance of evaluations suggests that multi-source settings outperform less-resourced approaches. The best NER models achieve 0.91 F-score on Slovene training data splits while the best official submission achieved F-scores of 0.84 and 0.78 for relaxed partial matching and strict settings, respectively. In multi-lingual NER setting we achieve F-scores of 0.82 and 0.74.
%U https://aclanthology.org/2021.bsnlp-1.9
%P 80-85
Markdown (Informal)
[Multilingual Named Entity Recognition and Matching Using BERT and Dedupe for Slavic Languages](https://aclanthology.org/2021.bsnlp-1.9) (Prelevikj & Zitnik, BSNLP 2021)
ACL