@inproceedings{canals-raventos-tato-2021-t4t,
title = "{T}4{T} Solution: {WMT}21 Similar Language Task for the {S}panish-{C}atalan and {S}panish-{P}ortuguese Language Pair",
author = "Canals, Miguel and
Ravent{\'o}s Tato, Marc",
editor = "Barrault, Loic and
Bojar, Ondrej and
Bougares, Fethi and
Chatterjee, Rajen and
Costa-jussa, Marta R. and
Federmann, Christian and
Fishel, Mark and
Fraser, Alexander and
Freitag, Markus and
Graham, Yvette and
Grundkiewicz, Roman and
Guzman, Paco and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Kocmi, Tom and
Martins, Andre and
Morishita, Makoto and
Monz, Christof",
booktitle = "Proceedings of the Sixth Conference on Machine Translation",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest_wac_2008/2021.wmt-1.28/",
pages = "279--283",
abstract = "The main idea of this solution has been to focus on corpus cleaning and preparation and after that, use an out of box solution (OpenNMT) with its default published transformer model. To prepare the corpus, we have used set of standard tools (as Moses scripts or python packages), but also, among other python scripts, a python custom tokenizer with the ability to replace numbers for variables, solve the upper/lower case issue of the vocabulary and provide good segmentation for most of the punctuation. We also have started a line to clean corpus based on statistical probability estimation of source-target corpus, with unclear results. Also, we have run some tests with syllabical word segmentation, again with unclear results, so at the end, after word sentence tokenization we have used BPE SentencePiece for subword units to feed OpenNMT."
}