% Record for ACL Anthology paper 2025.findings-ijcnlp.87 (Hammerla, Mehler, Abrami).
% Values are brace-delimited, field names lowercase, one aligned field per line.
% The url field uses the canonical aclanthology.org host; the previous value pointed
% at a preview.aclanthology.org ingestion-branch build, which is presumably a staging
% copy that will not stay reachable -- TODO confirm the canonical page resolves.
% NOTE(review): the anthology ID in the url says "findings-ijcnlp" while booktitle
% names the main proceedings volume -- confirm against the published record.
@inproceedings{hammerla-etal-2025-standardizing,
  title     = {Standardizing Heterogeneous Corpora with {DUUR}: A Dual Data- and Process-Oriented Approach to Enhancing {NLP} Pipeline Integration},
  author    = {Hammerla, Leon Lukas
               and Mehler, Alexander
               and Abrami, Giuseppe},
  editor    = {Inui, Kentaro
               and Sakti, Sakriani
               and Wang, Haofen
               and Wong, Derek F.
               and Bhattacharyya, Pushpak
               and Banerjee, Biplab
               and Ekbal, Asif
               and Chakraborty, Tanmoy
               and Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-ijcnlp.87/},
  pages     = {1410--1425},
  isbn      = {979-8-89176-303-6},
  abstract  = {Despite their success, LLMs are too computationally expensive to replace task- or domain-specific NLP systems. However, the variety of corpus formats makes reusing these systems difficult. This underscores the importance of maintaining an interoperable NLP landscape. We address this challenge by pursuing two objectives: standardizing corpus formats and enabling massively parallel corpus processing. We present a unified conversion framework embedded in a massively parallel, microservice-based, programming language-independent NLP architecture designed for modularity and extensibility. It allows for the integration of external NLP conversion tools and supports the addition of new components that meet basic compatibility requirements. To evaluate our dual data- and process-oriented approach to standardization, we (1) benchmark its efficiency in terms of processing speed and memory usage, (2) demonstrate the benefits of standardized corpus formats for NLP downstream tasks, and (3) illustrate the advantages of incorporating custom formats into a corpus format ecosystem.},
}

Markdown (Informal)
[Standardizing Heterogeneous Corpora with DUUR: A Dual Data- and Process-Oriented Approach to Enhancing NLP Pipeline Integration](https://aclanthology.org/2025.findings-ijcnlp.87/) (Hammerla et al., Findings 2025)
ACL