@comment{Workshop paper describing a searchable TEITOK-based interface for the DoReCo corpora. The url field holds the canonical ACL Anthology address rather than a temporary preview-build link.}
@inproceedings{janssen-seifart-2025-searchable,
  title     = {Searchable Language Documentation Corpora: {DoReCo} meets {TEITOK}},
  author    = {Janssen, Maarten and
               Seifart, Frank},
  editor    = {Le Ferrand, {\'E}ric and
               Klyachko, Elena and
               Postnikova, Anna and
               Shavrina, Tatiana and
               Serikov, Oleg and
               Voloshina, Ekaterina and
               Vylomova, Ekaterina},
  booktitle = {Proceedings of the Fourth Workshop on NLP Applications to Field Linguistics},
  month     = aug,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.fieldmatters-1.5/},
  pages     = {58--64},
  isbn      = {979-8-89176-282-4},
  abstract  = {In this paper, we describe a newly created searchable interface for DoReCo, a database that contains spoken corpora from a world-wide sample of 53, mostly lesser described languages, with audio, transcription, translation, and - for most languages - interlinear morpheme glosses. Until now, DoReCo data were available for download via the DoReCo website and via the Nakala repository in a number of different formats, but not directly accessible online. We created a graphical interface to view, listen to, and search these data online, providing direct and intuitive access for linguists and laypeople. The new interface uses the TEITOK corpus infrastructure to provide a number of different visualizations on individual documents in DoReCo and provides a search interface to perform detailed searches on individual languages. The use of TEITOK also enables the corpus for use with NLP pipelines, either using the data to train NLP models, or to use NLP models to enrich the data.},
}
Markdown (Informal)
[Searchable Language Documentation Corpora: DoReCo meets TEITOK](https://aclanthology.org/2025.fieldmatters-1.5/) (Janssen & Seifart, FieldMatters 2025)
ACL