% Conference paper from the LREC 2026 "main" volume (see the id in url).
% Typed as inproceedings: the venue is a conference, and the editor list is
% only rendered by proceedings-style entry types, not by article.
% The non-numeric volume label "main" is kept in the unrendered field
% anthology_volume so styles do not print it as a bound-volume number.
% NOTE(review): url is on a preview/ingest host; confirm the final address.
@inproceedings{mash-etal-2026-paraclean,
    title = "{ParaCLEAN}: Improving Translation Quality through Systematic Parallel Data Cleaning",
    author = "Mash, Audrey and
      Bohman, Ella Paulina and
      Melero, Maite",
    editor = "Piperidis, Stelios and
      Bel, N{\'u}ria and
      van den Heuvel, Henk and
      Ide, Nancy and
      Krek, Simon and
      Toral, Antonio",
    booktitle = "International Conference on Language Resources and Evaluation",
    anthology_volume = "main",
    month = may,
    year = "2026",
    address = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.527/",
    pages = "6630--6640",
    abstract = "Parallel corpora often contain significant noise, particularly in low-resource settings where both collected and synthetic data are combined. We present ParaCLEAN, a modular pipeline for cleaning parallel data that integrates embeddings-based filtering, language identification, deduplication, and normalisation. Experiments on Catalan to Japanese translation demonstrate that ParaCLEAN improves data quality and downstream MT performance. Ablation studies highlight the contribution of each step. ParaCLEAN is lightweight, reproducible, and extensible for diverse language pairs."
}
Markdown (Informal)
[ParaCLEAN: Improving Translation Quality through Systematic Parallel Data Cleaning](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.527/) (Mash et al., LREC 2026)
ACL