% Celano (2020): system description paper for the EvaLatin shared task
% (Latin POS tagging and lemmatization), published in the LT4HALA 2020 proceedings.
% The url field holds the permanent ACL Anthology address for paper 2020.lt4hala-1.19.
% Case-sensitive title words are brace-protected as whole words.
@inproceedings{celano-2020-gradient,
  title     = {A Gradient Boosting-{Seq2Seq} System for {Latin} {POS} Tagging and Lemmatization},
  author    = {Celano, Giuseppe G. A.},
  editor    = {Sprugnoli, Rachele and
               Passarotti, Marco},
  booktitle = {Proceedings of LT4HALA 2020 - 1st Workshop on Language Technologies for Historical and Ancient Languages},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://aclanthology.org/2020.lt4hala-1.19/},
  pages     = {119--123},
  language  = {eng},
  isbn      = {979-10-95546-53-5},
  abstract  = {The paper presents the system used in the EvaLatin shared task to POS tag and lemmatize Latin. It consists of two components. A gradient boosting machine (LightGBM) is used for POS tagging, mainly fed with pre-computed word embeddings of a window of seven contiguous tokens{---}the token at hand plus the three preceding and following ones{---}per target feature value. Word embeddings are trained on the texts of the Perseus Digital Library, Patrologia Latina, and Biblioteca Digitale di Testi Tardo Antichi, which together comprise a high number of texts of different genres from the Classical Age to Late Antiquity. Word forms plus the outputted POS labels are used to feed a seq2seq algorithm implemented in Keras to predict lemmas. The final shared-task accuracies measured for Classical Latin texts are in line with state-of-the-art POS taggers ({\ensuremath{\sim}}0.96) and lemmatizers ({\ensuremath{\sim}}0.95).},
}
Markdown (Informal)
[A Gradient Boosting-Seq2Seq System for Latin POS Tagging and Lemmatization](https://aclanthology.org/2020.lt4hala-1.19/) (Celano, LT4HALA 2020)
ACL