% Conference paper introducing MultiLeg, a parallel multilingual named-entity
% dataset for text sanitisation, published in the LREC-COLING 2024 proceedings.
% The dataset name is braced as a whole word so sentence-casing styles keep its
% capitals, and the url field uses the permanent ACL Anthology address of the
% paper rather than a temporary preview-branch deployment.
% NOTE(review): address appears to hold the conference venue, not the
% publisher's city - confirm this matches the convention used in this file.
@inproceedings{viksna-skadina-2024-multileg,
  title     = {{MultiLeg}: Dataset for Text Sanitisation in Less-resourced Languages},
  author    = {V{\={i}}ksna, Rinalds and
               Skadi{\c{n}}a, Inguna},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.1028/},
  pages     = {11776--11782},
  abstract  = {Text sanitization is the task of detecting and removing personal information from the text. While it has been well-studied in monolingual settings, today, there is also a need for multilingual text sanitization. In this paper, we introduce MultiLeg: a parallel, multilingual named entity (NE) dataset consisting of documents from the Court of Justice of the European Union annotated with semantic categories suitable for text sanitization. The dataset is available in 8 languages, and it contains 3082 parallel text segments for each language. We also show that the pseudonymized dataset remains useful for downstream tasks.},
}
Markdown (Informal)
[MultiLeg: Dataset for Text Sanitisation in Less-resourced Languages](https://aclanthology.org/2024.lrec-main.1028/) (Vīksna & Skadiņa, LREC-COLING 2024)
ACL