% ACL Anthology record 2021.ranlp-srw.4 (RANLP 2021 Student Research Workshop).
% The url field uses the canonical Anthology address, not a preview-branch link.
@inproceedings{ali-etal-2021-sipos-benchmark,
  title     = {{SiPOS}: A Benchmark Dataset for {Sindhi} Part-of-Speech Tagging},
  author    = {Ali, Wazir and
               Xu, Zenglin and
               Kumar, Jay},
  editor    = {Djabri, Souhila and
               Gimadi, Dinara and
               Mihaylova, Tsvetomila and
               Nikolova-Koleva, Ivelina},
  booktitle = {Proceedings of the Student Research Workshop Associated with {RANLP} 2021},
  month     = sep,
  year      = {2021},
  address   = {Online},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.ranlp-srw.4/},
  pages     = {22--30},
  abstract  = {In this paper, we introduce the SiPOS dataset for part-of-speech tagging in the low-resource Sindhi language with quality baselines. The dataset consists of more than 293K tokens annotated with sixteen universal part-of-speech categories. Two experienced native annotators annotated the SiPOS using the Doccano text annotation tool with an inter-annotation agreement of 0.872. We exploit the conditional random field, the popular bidirectional long-short-term memory neural model, and self-attention mechanism with various settings to evaluate the proposed dataset. Besides pre-trained GloVe and fastText representation, the character-level representations are incorporated to extract character-level information using the bidirectional long-short-term memory encoder. The high accuracy of 96.25{\%} is achieved with the task-specific joint word-level and character-level representations. The SiPOS dataset is likely to be a significant resource for the low-resource Sindhi language.},
}
Markdown (Informal)
[SiPOS: A Benchmark Dataset for Sindhi Part-of-Speech Tagging](https://aclanthology.org/2021.ranlp-srw.4/) (Ali et al., RANLP 2021)
ACL