% LREC 2020 paper on building monolingual corpora from PDF files for four
% indigenous languages of Peru. Record follows the ACL Anthology export.
% The url field uses the permanent Anthology address, not a preview-branch link.
% Braces in the title protect words that must keep their capital letter under
% sentence-casing styles, including the first word after the question mark.
% NOTE(review): address carries the conference venue as exported; confirm
% whether the target style expects the publisher city there instead.
@inproceedings{bustamante-etal-2020-data,
  title     = {No Data to Crawl? {Monolingual} Corpus Creation from {PDF} Files of Truly low-Resource Languages in {Peru}},
  author    = {Bustamante, Gina and
               Oncevay, Arturo and
               Zariquiey, Roberto},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.356/},
  pages     = {2914--2923},
  language  = {eng},
  isbn      = {979-10-95546-34-4},
  abstract  = {We introduce new monolingual corpora for four indigenous and endangered languages from Peru: Shipibo-konibo, Ashaninka, Yanesha and Yine. Given the total absence of these languages in the web, the extraction and processing of texts from PDF files is relevant in a truly low-resource language scenario. Our procedure for monolingual corpus creation considers language-specific and language-agnostic steps, and focuses on educational PDF files with multilingual sentences, noisy pages and low-structured content. Through an evaluation based on language modelling and character-level perplexity on a subset of manually extracted sentences, we determine that our method allows the creation of clean corpora for the four languages, a key resource for natural language processing tasks nowadays.},
}
Markdown (Informal)
[No Data to Crawl? Monolingual Corpus Creation from PDF Files of Truly low-Resource Languages in Peru](https://aclanthology.org/2020.lrec-1.356/) (Bustamante et al., LREC 2020)
ACL