% SiNER: Sindhi named entity recognition dataset paper (LREC 2020, ACL Anthology ID 2020.lrec-1.361).
% The url field uses the canonical Anthology landing page rather than a temporary preview build.
@inproceedings{ali-etal-2020-siner,
  title     = {{SiNER}: A Large Dataset for {Sindhi} Named Entity Recognition},
  author    = {Ali, Wazir and
               Lu, Junyu and
               Xu, Zenglin},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.361/},
  pages     = {2953--2961},
  language  = {eng},
  isbn      = {979-10-95546-34-4},
  abstract  = {We introduce the SiNER: a named entity recognition (NER) dataset for low-resourced Sindhi language with quality baselines. It contains 1,338 news articles and more than 1.35 million tokens collected from Kawish and Awami Awaz Sindhi newspapers using the begin-inside-outside (BIO) tagging scheme. The proposed dataset is likely to be a significant resource for statistical Sindhi language processing. The ultimate goal of developing SiNER is to present a gold-standard dataset for Sindhi NER along with quality baselines. We implement several baseline approaches of conditional random field (CRF) and recent popular state-of-the-art bi-directional long-short term memory (Bi-LSTM) models. The promising F1-score of 89.16 outputted by the Bi-LSTM-CRF model with character-level representations demonstrates the quality of our proposed SiNER dataset.},
}
Markdown (Informal)
[SiNER: A Large Dataset for Sindhi Named Entity Recognition](https://aclanthology.org/2020.lrec-1.361/) (Ali et al., LREC 2020)
ACL