% Workshop paper from LAW-XIX (2025). The url field uses the canonical ACL Anthology permalink (same ID as the DOI) instead of a staging preview-branch address.
@inproceedings{domotor-etal-2025-variety,
  author    = {D{\"o}m{\"o}t{\"o}r, Andrea and
               Indig, Bal{\'a}zs and
               Nemeskey, D{\'a}vid M{\'a}rk},
  title     = {Variety delights (sometimes) - Annotation differences in morphologically annotated corpora},
  editor    = {Peng, Siyao and
               Rehbein, Ines},
  booktitle = {Proceedings of the 19th Linguistic Annotation Workshop (LAW-XIX-2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {270--278},
  doi       = {10.18653/v1/2025.law-1.22},
  url       = {https://aclanthology.org/2025.law-1.22/},
  isbn      = {979-8-89176-262-6},
  abstract  = {The goal of annotation standards is to ensure consistency across different corpora and languages. But do they succeed? In our paper we experiment with morphologically annotated Hungarian corpora of different sizes (ELTE DH gold standard corpus, NYTK-NerKor, and Szeged Treebank) to assess their compatibility as a merged training corpus for morphological analysis and disambiguation. Our results show that combining any two corpora not only failed to improve the results of the trained tagger but even degraded them due the inconsistent annotations. Further analysis of the annotation differences among the corpora revealed inconsistencies of several sources: different theoretical approach, lack of consensus, and tagset conversion issues.},
}
Markdown (Informal)
[Variety delights (sometimes) - Annotation differences in morphologically annotated corpora](https://aclanthology.org/2025.law-1.22/) (Dömötör et al., LAW 2025)
ACL