% LT4Gov 2020 workshop paper describing the Legal-ES resource kit.
% The url field uses the canonical ACL Anthology address rather than a preview-build address.
@inproceedings{samy-etal-2020-legal,
  title     = {{Legal-ES}: A Set of Large Scale Resources for {Spanish} Legal Text Processing},
  author    = {Samy, Doaa and
               Arenas-Garc{\'i}a, Jer{\'o}nimo and
               P{\'e}rez-Fern{\'a}ndez, David},
  editor    = {Samy, Doaa and
               P{\'e}rez-Fern{\'a}ndez, David and
               Arenas-Garc{\'i}a, Jer{\'o}nimo},
  booktitle = {Proceedings of the 1st Workshop on Language Technologies for Government and Public Administration ({LT4Gov})},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lt4gov-1.6/},
  pages     = {32--36},
  language  = {eng},
  isbn      = {979-10-95546-62-7},
  abstract  = {Legal-ES is an open source resource kit for legal Spanish. It consists of a large scale Spanish corpus of open legal texts and different kinds of language models including word embeddings and topic models. The corpus includes over 1000 million words covering a collection of legislative and administrative open access documents in Spanish from different sources representing international, national and regional entities. The corpus is pre-processed and tokenized using Spacy. For the word embeddings, gensim was used on the collection of tokens, producing a representation space that is especially suited to reflect the inherent characteristics of the legal domain. We calculate also topic models to obtain a convenient tool to understand the main topics in the corpus and to navigate through the documents exploiting the semantic similarity among documents. We will analyse the time structure of a dynamic topic model to infer changes in the legal production of Spanish jurisdiction that have occurred over the analysed time framework.},
}
Markdown (Informal)
[Legal-ES: A Set of Large Scale Resources for Spanish Legal Text Processing](https://aclanthology.org/2020.lt4gov-1.6/) (Samy et al., LT4Gov 2020)
ACL