% Workshop paper from the ACL Anthology (Anthology ID 2020.nlposs-1.14).
% The url field uses the permanent aclanthology.org address, not a preview/staging branch link.
@inproceedings{granroth-wilding-2020-pimlico,
  title     = {{Pimlico}: A toolkit for corpus-processing pipelines and reproducible experiments},
  author    = {Granroth-Wilding, Mark},
  editor    = {Park, Eunjeong L. and
               Hagiwara, Masato and
               Milajevs, Dmitrijs and
               Liu, Nelson F. and
               Chauhan, Geeticka and
               Tan, Liling},
  booktitle = {Proceedings of Second Workshop for {NLP} Open Source Software ({NLP-OSS})},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.nlposs-1.14/},
  doi       = {10.18653/v1/2020.nlposs-1.14},
  pages     = {101--109},
  abstract  = {We present Pimlico, an open source toolkit for building pipelines for processing large corpora. It is especially focused on processing linguistic corpora and provides wrappers around existing, widely used NLP tools. A particular goal is to ease distribution of reproducible and extensible experiments by making it easy to document and re-run all steps involved, including data loading, pre-processing, model training and evaluation. Once a pipeline is released, it is easy to adapt, for example, to run on a new dataset, or to re-run an experiment with different parameters. The toolkit takes care of many common challenges in writing and distributing corpus-processing code, such as managing data between the steps of a pipeline, installing required software and combining existing toolkits with new, task-specific code.},
}
Markdown (Informal)
[Pimlico: A toolkit for corpus-processing pipelines and reproducible experiments](https://aclanthology.org/2020.nlposs-1.14/) (Granroth-Wilding, NLPOSS 2020)
ACL